/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
* The System Duty Cycle (SDC) scheduling class
* --------------------------------------------
*
* Background
*
* Kernel threads in Solaris have traditionally not been large consumers
* of CPU time. They typically wake up, perform a small amount of
* work, then go back to sleep waiting for either a timeout or another
* signal. On the assumption that the small amount of work that they do
* is important for the behavior of the whole system, these threads are
* treated kindly by the dispatcher and the SYS scheduling class: they run
* without preemption from anything other than real-time and interrupt
* threads; when preempted, they are put at the front of the queue, so they
* generally do not migrate between CPUs; and they are allowed to stay
* running until they voluntarily give up the CPU.
*
* As Solaris has evolved, new workloads have emerged which require the
* kernel to perform significant amounts of CPU-intensive work. One
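* example of such a workload is ZFS's transaction group sync processing.
* Each sync operation generates a large batch of I/Os, and each I/O may
* need to be compressed and/or checksummed before it is written to
* storage. The taskq threads which perform the compression and checksums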
* will run nonstop as long as they have work to do; a large sync operation
* on a compression-heavy dataset can keep them busy for seconds on end.
* This causes human-time-scale dispatch latency bubbles for any other
* threads which have the misfortune to share a CPU with the taskq threads.
*
* The SDC scheduling class is a solution to this problem.
*
*
* Overview
*
* SDC is centered around the concept of a thread's duty cycle (DC):
*
*                       ONPROC time
*     Duty Cycle = ----------------------
*                  ONPROC + Runnable time
*
* This is the ratio of the time that the thread spent running on a CPU
* divided by the time it spent running or trying to run. It is unaffected
* by any time the thread spent sleeping, stopped, etc.
*
* A thread joining the SDC class specifies a "target" DC that it wants
* to run at. To implement this policy, the routine sysdc_update() scans
* the list of active SDC threads every few ticks and uses each thread's
* microstate data to compute the actual duty cycle that that thread
* has experienced recently. If the thread is under its target DC, its
* priority is increased to the maximum available (sysdc_maxpri, which is
* 99 by default). If the thread is over its target DC, its priority is
* reduced to the minimum available (sysdc_minpri, 0 by default). This
* is a fairly primitive approach, in that it doesn't use any of the
* intermediate priorities, but it's not completely inappropriate. Even
* though threads in the SDC class might take a while to do their job, they
* are by some definition important if they're running inside the kernel,
* so it is reasonable that they should get to run at priority 99.
*
* If a thread is running when sysdc_update() calculates its actual duty
* cycle, and there are other threads of equal or greater priority on its
* CPU's dispatch queue, sysdc_update() preempts that thread. The thread
* acknowledges the preemption by calling sysdc_preempt(), which calls
* setbackdq(), which gives other threads with the same priority a chance
* to run. This creates a de facto time quantum for threads in the SDC
* scheduling class.
*
* An SDC thread which is assigned priority 0 can continue to run if
* nothing else needs to use the CPU that it's running on. Similarly, an
* SDC thread at priority 99 might not get to run as much as it wants to
* if there are other priority-99 or higher threads on its CPU. These
* situations would cause the thread to get ahead of or behind its target
* DC; the longer the situations lasted, the further ahead or behind the
* thread would get. Rather than condemning a thread to a lifetime of
* paying for its youthful indiscretions, SDC keeps "base" values for
* ONPROC and Runnable times in each thread's sysdc data, and updates these
* values periodically. The duty cycle is then computed using the elapsed
* amount of ONPROC and Runnable times since those base times.
*
* Since sysdc_update() scans SDC threads fairly frequently, it tries to
* keep the list of "active" threads small by pruning out threads which
* have been asleep for a brief time. They are not pruned immediately upon
* going to sleep, since some threads may bounce back and forth between
* sleeping and being runnable.
*
*
* Interfaces
*
* void sysdc_thread_enter(t, dc, flags)
*
* Moves a kernel thread from the SYS scheduling class to the
* SDC class. t must have an associated LWP (created by calling
* lwp_kernel_create()). The thread will have a target DC of dc.
* Flags should be either 0 or SYSDC_THREAD_BATCH. If
* SYSDC_THREAD_BATCH is specified, the thread is expected to be
* doing large amounts of processing.
*
*
* Complications
*
* - Run queue balancing
*
* The Solaris dispatcher is biased towards letting a thread run
* on the same CPU which it last ran on, if no more than 3 ticks
* (i.e. rechoose_interval) have passed since the thread last ran.
* This helps to preserve cache warmth. On the other hand, it also
* tries to keep the per-CPU run queues fairly balanced; if the CPU
* chosen for a runnable thread has a run queue which is three or
* more threads longer than a neighboring CPU's queue, the runnable
* thread is dispatched onto the neighboring CPU instead.
*
* These policies work well for some workloads, but not for many SDC
* threads. The taskq client of SDC, for example, has many discrete
* units of work to do. The work units are largely independent, so
* cache warmth is not an important consideration. It is important
* that the threads fan out quickly to different CPUs, since the
* amount of work these threads have to do (a few seconds worth at a
* time) doesn't leave much time to correct thread placement errors
* (i.e. two SDC threads being dispatched to the same CPU).
*
* To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
* This tells the dispatcher to keep neighboring run queues' lengths
* more evenly matched, which allows SDC threads to migrate more
* easily.
*
* - LWPs and system processes
*
* SDC can only be used for kernel threads. Since SDC uses microstate
* accounting data to compute each thread's actual duty cycle, all
* threads entering the SDC class must have associated LWPs (which
* store the microstate data). This means that the threads have to
* be associated with an SSYS process, i.e. one created by newproc().
* If the microstate accounting information is ever moved into the
* kthread_t, this restriction could be lifted.
*
* - Dealing with oversubscription
*
* Since SDC duty cycles are per-thread, it is possible that the
* aggregate requested duty cycle of all SDC threads in a processor
* set could be greater than the total CPU time available in that set.
* The FSS scheduling class has an analogous situation, which it deals
* with by reducing each thread's allotted CPU time proportionally.
* Since SDC doesn't need to be as precise as FSS, it uses a simpler
* solution to the oversubscription problem.
*
* sysdc_update() accumulates the amount of time that max-priority SDC
* threads have spent on-CPU in each processor set, and uses that sum
* to create an implied duty cycle for that processor set:
*
*                      accumulated CPU time
*     pset DC = -----------------------------------
*                (# CPUs) * time since last update
*
* If this implied duty cycle is above a maximum pset duty cycle (90%
* by default), sysdc_update() sets the priority of all SDC threads
* in that processor set to sysdc_minpri for a "break" period. After
* the break period, it waits for a "nobreak" period before trying to
* enforce the pset duty cycle limit again.
*
* - Processor sets
*
* As the above implies, SDC is processor set aware, but it does not
* currently allow threads to change processor sets while in the SDC
* class. Instead, those threads must join the desired processor set
* before entering SDC. [1]
*
* - Batch threads
*
* A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
* flag. This flag currently has no effect, but marks threads which
* do bulk processing.
*
* - t_kpri_req
*
* The TS and FSS scheduling classes pay attention to t_kpri_req,
* which provides a simple form of priority inheritance for
* synchronization primitives (such as rwlocks held as READER) which
* cannot be traced to a unique thread. The SDC class does not honor
* t_kpri_req, for a few reasons:
*
* 1. t_kpri_req is notoriously inaccurate. A measure of its
* inaccuracy is that it needs to be cleared every time a thread
* returns to user mode, because it is frequently non-zero at that
* point. This can happen because "ownership" of synchronization
* primitives that use t_kpri_req can be silently handed off,
* leaving no opportunity to will the t_kpri_req inheritance.
*
* 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
* kernel priority. This means that even if an SDC thread
* is holding a synchronization primitive and running at low
* priority, its priority will eventually be raised above 60,
* allowing it to drive on and release the resource.
*
* 3. The first consumer of SDC uses the taskq subsystem, which holds
* a reader lock for the duration of the task's execution. This
* would mean that SDC threads would never drop below kernel
* priority in practice, which defeats one of the purposes of SDC.
*
* - Why not FSS?
*
* It might seem that the existing FSS scheduling class could solve
* the problems that SDC is attempting to solve. FSS's more precise
* solution to the oversubscription problem would hardly cause
* trouble, as long as it performed well. SDC is implemented as
* a separate scheduling class for two main reasons: the initial
* consumer of SDC does not map well onto the "project" abstraction
* that is central to FSS, and FSS does not expect to run at kernel
* priorities.
*
*
* Tunables
*
* - sysdc_update_interval_msec: Number of milliseconds between
* consecutive thread priority updates.
*
* - sysdc_reset_interval_msec: Number of milliseconds between
* consecutive resets of a thread's base ONPROC and Runnable
* times.
*
* - sysdc_prune_interval_msec: Number of milliseconds of sleeping
* before a thread is pruned from the active list.
*
* - sysdc_max_pset_DC: Allowable percentage of a processor set's
* CPU time which SDC can give to its high-priority threads.
*
* - sysdc_break_msec: Number of milliseconds of "break" taken when
* sysdc_max_pset_DC is exceeded.
*
*
* Future work (in SDC and related subsystems)
*
* - Per-thread rechoose interval (0 for SDC)
*
* Allow each thread to specify its own rechoose interval. SDC
* threads would specify an interval of zero, which would rechoose
* the CPU with the lowest priority once per update.
*
* - Allow threads to change processor sets after joining the SDC class
*
* - Thread groups and per-group DC
*
* It might be nice to be able to specify a duty cycle which applies
* to a group of threads in aggregate.
*
* - Per-group DC callback to allow dynamic DC tuning
*
* Currently, DCs are assigned when the thread joins SDC. Some
* workloads could benefit from being able to tune their DC using
* subsystem-specific knowledge about the workload.
*
* - Finer-grained priority updates
*
* - More nuanced management of oversubscription
*
* - Moving other CPU-intensive threads into SDC
*
* - Move msacct data into kthread_t
*
* This would allow kernel threads without LWPs to join SDC.
*
*
* Footnotes
*
* [1] The details of doing so are left as an exercise for the reader.
*/
#include <sys/sysdc_impl.h>
#include <sys/schedctl.h>
#include <sys/sysmacros.h>
/*
* Tunables - loaded into the internal state at module load time
*/
/*
* Internal state - constants set up by sysdc_initparam()
*/
/*
* Internal state
*/
/*
* Internal state - active hash table
*/
/*
* Debug statistics.
*
* DEBUG builds declare the sysdc_stats aggregate; non-DEBUG builds instead
* define SYSDC_INC_STAT() as a no-op expression so that call sites cost
* nothing.
*
* NOTE(review): no counters inside sysdc_stats and no DEBUG-side definition
* of SYSDC_INC_STAT() are visible in this view -- confirm both exist in the
* full source before relying on the macro in DEBUG builds.
*/
#ifdef DEBUG
static struct {
} sysdc_stats;
#else
/* Non-DEBUG: evaluates to a void expression and ignores its argument. */
#define SYSDC_INC_STAT(x) ((void)0)
#endif
/* macros are UPPER CASE */
/*
* sysdc_initparam()
*
* Sets up the class's internal parameters. In the code visible here it
* sets the minimum duty cycle (sysdc_minDC) to 1 and the minimum priority
* (sysdc_minpri) to 0, compares sysdc_max_pset_DC against SYSDC_DC_MAX,
* and finally sets sysdc_param_init to 1 to record that initialization
* has run.
*
* Takes no arguments and returns nothing; all results are left in
* file-scope variables.
*
* NOTE(review): the interval and break/nobreak computations that the
* section comments below refer to are not visible in this view -- confirm
* against the full source.
*/
static void
sysdc_initparam(void)
{
/* update / prune intervals */
/* We must get at least a little time on CPU. */
sysdc_minDC = 1;
sysdc_minpri = 0;
/* break parameters */
/*
* NOTE(review): the over-limit branch below is empty as seen here;
* presumably it is meant to clamp sysdc_max_pset_DC -- confirm.
*/
if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
}
/*
* We want:
*
*     sysdc_max_pset_DC = (nobreak / (break + nobreak))
*
*     ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
*
*                   sysdc_max_pset_DC * break
*     ==> nobreak = -------------------------
*                     1 - sysdc_max_pset_DC
*/
/* Record that the parameters have been initialized. */
sysdc_param_init = 1;
}
/*
* Updates the recorded times in the sdc, and returns the elapsed ONPROC
* and Runnable times since the last reset.
*
* newO is the thread's actual ONPROC time; it's used during sysdc_update()
* to track processor set usage.
*/
static void
{
ASSERT(THREAD_LOCK_HELD(t));
*O = *R = 0;
/* If we've been sleeping, we know we haven't had any ONPROC time. */
if (sdc->sdc_sleep_updates != 0 &&
return;
}
/*
* If this is our first update, or we've hit the reset point,
* we need to reset our base_{O,R}. Once we've updated them, we
* report O and R for the entire prior interval.
*/
if (update) {
++sdc->sdc_nupdates;
do_reset = 1;
}
if (do_reset) {
if (initial) {
/*
* Start off our cycle count somewhere in the middle,
* to keep the resets from all happening at once.
*
* 4999 is a handy prime much larger than
* sysdc_reset_updates, so that we don't run into
* trouble if the resolution is a multiple of
* sysdc_reset_updates.
*/
} else {
}
/*
* See below for rationale.
*/
}
/* compute based on the entire interval */
return;
}
/*
* If we're called from sysdc_update(), we *must* return a value
* for newO, so we always call mstate_systhread_times().
*
* Otherwise, if we've already done a pri check this tick,
* we can skip it.
*/
return;
}
/* Get the current times from the thread */
/*
* The updating of microstate accounting is not done under a
* consistent set of locks, particularly the t_waitrq field. This
* can lead to narrow windows in which we account for time in the
* wrong bucket, which on the next read will be accounted for
* correctly.
*
* If our sdc_base_* fields were affected by one of these blips, we
* throw away the old data, and pretend this tick didn't happen.
*/
return;
}
}
/*
* sysdc_compute_pri()
*
* Recomputes the priority of the thread, leaving the result in
* sdc->sdc_epri. Returns 1 if a priority update should occur
* (which will also trigger a cpu_surrender()), otherwise
* returns 0.
*/
static uint_t
{
hrtime_t O, R;
ASSERT(THREAD_LOCK_HELD(t));
/* If we have new data, recompute our priority. */
if ((O + R) != 0) {
/* Adjust our priority to move our DC closer to the target. */
else
}
/*
* If our per-pset duty cycle goes over the max, we will take a break.
* This forces all sysdc threads in the pset to minimum priority, in
* order to let everyone else have a chance at the CPU.
*/
} else {
}
/*
* For sysdc_update(), we compute the ONPROC time for high-priority
* threads, which is used to calculate the per-pset duty cycle. We
* will always tell our callers to update the thread's priority,
* since we want to force a cpu_surrender().
*
* We reset sdc_update_ticks so that sysdc_tick() will only update
* the thread's priority if our timeout is delayed by a tick or
* more.
*/
if (update) {
/* SDC threads are not allowed to change cpupart bindings. */
/* If we were at MAXPRI, account for our onproc time. */
sdc->sdc_last_base_O != 0 &&
} else {
sdc->sdc_last_O = 0;
}
return (1);
}
/*
* Like sysdc_update(), sysdc_tick() always wants to update the
* thread's priority, so that the CPU is surrendered if necessary.
* We reset sdc_update_ticks so that if the timeout continues to be
* delayed, we'll update at the regular interval.
*/
if (tick) {
return (1);
}
/*
* Otherwise, only tell our callers to update the priority if it has
* changed.
*/
}
static void
{
ASSERT(THREAD_LOCK_HELD(t));
cpu_surrender(t);
}
}
}
/*
* Add a thread onto the active list. It will only be removed by
* sysdc_update().
*/
static void
{
ASSERT(THREAD_LOCK_HELD(t));
do {
}
/*
* sysdc_update() has two jobs:
*
* 1. It updates the priorities of all active SDC threads on the system.
* 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
*/
static void
{
int idx;
/*
* If this is our first time through, diff will be gigantic, and
* no breaks will be necessary.
*/
}
cur->sdp_onproc_time = 0;
cur->sdp_onproc_threads = 0;
}
if (*headp == &sysdc_dummy)
continue;
/* Prevent any threads from exiting while we're poking them. */
/*
* Each sdl_list contains a singly-linked list of active
* threads. Threads which become active while we are
* processing the list will be added to sdl_list. Since we
* don't want that to interfere with our own processing, we
* swap in an empty list. Any newly active threads will
* go on to this empty list. When finished, we'll put any
* such threads at the end of the processed list.
*/
while (*prevptr != &sysdc_dummy) {
/*
* If the thread has exited, move its sysdc_t onto
* freelist, to be freed later.
*/
if (t == NULL) {
continue;
}
thread_lock(t);
thread_unlock(t);
continue;
}
/*
* If the thread has been sleeping for longer
* than sysdc_prune_interval, make it inactive by
* removing it from the list.
*/
sdc->sdc_sleep_updates != 0 &&
thread_unlock(t);
continue;
}
thread_unlock(t);
}
/*
* Add our list to the bucket, putting any new entries
* added while we were working at the tail of the list.
*/
do {
}
if (cur->sdp_should_break > 0) {
continue;
}
if (cur->sdp_dont_break > 0) {
continue;
}
}
}
/*
* If there are no sysdc_psets, there can be no threads, so
* we can stop doing our timeout. Since we're holding the
* sysdc_pset_lock, no new sysdc_psets can come in, which will
* prevent anyone from racing with this and dropping our timeout
* on the floor.
*/
if (list_is_empty(&sysdc_psets)) {
redeploy = 0;
}
}
if (redeploy) {
}
}
static void
{
ASSERT(THREAD_LOCK_HELD(t));
setbackdq(t); /* give others a chance to run */
}
static void
{
thread_lock(t);
thread_unlock(t);
return;
}
cpu_surrender(t);
}
}
}
thread_unlock(t);
}
static void
{
sdc->sdc_sleep_updates = 0;
/*
* Since we're in transition, we don't want to use the
* full thread_update_pri().
*/
if (sysdc_compute_pri(sdc, 0)) {
}
}
setbackdq(t);
}
static void
{
sysdc_setrun(t);
}
static void
{
}
/*ARGSUSED*/
static int
void *bufp)
{
int start_timeout;
return (EPERM);
sdc->sdc_thread = t;
/* Assign ourselves to the appropriate pset. */
break;
}
}
}
pset->sdp_nthreads++;
start_timeout = (sysdc_update_timeout_started == 0);
/* Update t's scheduling class and priority. */
thread_lock(t);
t->t_schedflag |= TS_RUNQMATCH;
thread_unlock(t);
/* Kick off the thread timeout if we're the first one in. */
if (start_timeout) {
}
return (0);
}
static void
{
--sdp->sdp_nthreads;
if (sdp->sdp_nthreads == 0) {
} else {
}
if (freedc)
}
static void
{
}
/*ARGSUSED*/
static int
{
/* Threads cannot exit SDC once joined, except in a body bag. */
return (EPERM);
}
static void
{
/* We're exiting, so we just rejoin the SYS class. */
thread_lock(t);
(void) thread_change_pri(t, maxclsyspri, 0);
t->t_schedflag &= ~TS_RUNQMATCH;
/* Unlink the sdc from everything. */
}
/*ARGSUSED*/
static int
{
/*
* Threads cannot be created with SDC as their class; they must
* be created as SYS and then added with sysdc_thread_enter().
* Because of this restriction, sysdc_fork() should never be called.
*/
panic("sysdc cannot be forked");
return (ENOSYS);
}
/*ARGSUSED*/
static void
{
/* SDC threads are part of system processes, which never fork. */
panic("sysdc cannot be forked");
}
static pri_t
{
return (t->t_epri);
}
/*ARGSUSED*/
static pri_t
{
/* SDC threads cannot be swapped. */
return (-1);
}
/*
* Get maximum and minimum priorities enjoyed by SDC threads.
*/
static int
{
return (0);
}
/*ARGSUSED*/
static int
{
return (0); /* no class-specific info */
}
/*ARGSUSED*/
static int
{
*p = NULL;
return (ENOMEM);
}
NULL) {
return (ENOMEM);
}
*p = new;
return (0);
}
static void
sysdc_free(void *p)
{
/*
* We must have failed CL_ENTERCLASS(), so our pset should be
* there and unused.
*/
}
}
static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();
/* messages to class manager */
{
sysdc_enosys, /* admin */
sysdc_enosys, /* parmsin */
sysdc_enosys, /* parmsout */
sysdc_enosys, /* vaparmsin */
sysdc_enosys, /* vaparmsout */
},
/* operations on threads */
{
sysdc_nullsys, /* parmsget */
sysdc_enosys, /* parmsset */
sysdc_nullsys, /* stop */
sysdc_nullsys, /* active */
sysdc_nullsys, /* inactive */
sysdc_no_swap, /* swapin */
sysdc_no_swap, /* swapout */
sysdc_nullsys, /* trapret */
sysdc_einval, /* donice */
sysdc_nullsys, /* set_process_group */
sysdc_nullsys, /* yield */
sysdc_einval, /* doprio */
}
};
static int
{
return (ENOSYS);
}
static int
{
return (EINVAL);
}
static void
{
}
/*ARGSUSED*/
static pri_t
{
int idx;
}
return ((pri_t)v.v_maxsyspri);
}
"SDC",
0
};
};
};
/*
 * _init()
 *
 * Module load entry point (see _init(9E)).  Hands this module's
 * modlinkage to mod_install() and returns that call's result unchanged,
 * so the caller sees 0 or whatever error mod_install() reported.
 *
 * Declared with an explicit (void) parameter list so the definition
 * serves as a prototype, matching sysdc_initparam(void) in this file.
 */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
/*
 * _fini()
 *
 * Module unload entry point (see _fini(9E)).  Unloading is not supported
 * for now, so this unconditionally returns EBUSY and does nothing else.
 *
 * Declared with an explicit (void) parameter list so the definition
 * serves as a prototype, matching sysdc_initparam(void) in this file.
 */
int
_fini(void)
{
	return (EBUSY);		/* can't unload for now */
}
int
{
}
/* --- consolidation-private interfaces --- */
void
{
}