/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
* The System Duty Cycle (SDC) scheduling class
* --------------------------------------------
*
* Background
*
* Kernel threads in Solaris have traditionally not been large consumers
* of CPU time. They typically wake up, perform a small amount of
* work, then go back to sleep waiting for either a timeout or another
* signal. On the assumption that the small amount of work that they do
* is important for the behavior of the whole system, these threads are
* treated kindly by the dispatcher and the SYS scheduling class: they run
* without preemption from anything other than real-time and interrupt
* threads; when preempted, they are put at the front of the queue, so they
* generally do not migrate between CPUs; and they are allowed to stay
* running until they voluntarily give up the CPU.
*
* As Solaris has evolved, new workloads have emerged which require the
* kernel to perform significant amounts of CPU-intensive work. One
* example of such a workload is ZFS's transaction group sync processing.
* Each sync operation generates a large batch of I/Os, and each I/O
* may need to be compressed and/or checksummed before it is written to
* storage. The taskq threads which perform the compression and checksums
* will run nonstop as long as they have work to do; a large sync operation
* on a compression-heavy dataset can keep them busy for seconds on end.
* This causes human-time-scale dispatch latency bubbles for any other
* threads which have the misfortune to share a CPU with the taskq threads.
*
* The SDC scheduling class is a solution to this problem.
*
*
* Overview
*
* SDC is centered around the concept of a thread's duty cycle (DC):
*
* ONPROC time
* Duty Cycle = ----------------------
* ONPROC + Runnable time
*
* This is the ratio of the time that the thread spent running on a CPU
* divided by the time it spent running or trying to run. It is unaffected
* by any time the thread spent sleeping, stopped, etc.
*
* A thread joining the SDC class specifies a "target" DC that it wants
* to run at. To implement this policy, the routine sysdc_update() scans
* the list of active SDC threads every few ticks and uses each thread's
* microstate data to compute the actual duty cycle that that thread
* has experienced recently. If the thread is under its target DC, its
* priority is increased to the maximum available (sysdc_maxpri, which is
* 99 by default). If the thread is over its target DC, its priority is
* reduced to the minimum available (sysdc_minpri, 0 by default). This
* is a fairly primitive approach, in that it doesn't use any of the
* intermediate priorities, but it's not completely inappropriate. Even
* though threads in the SDC class might take a while to do their job, they
* are by some definition important if they're running inside the kernel,
* so it is reasonable that they should get to run at priority 99.
*
* If a thread is running when sysdc_update() calculates its actual duty
* cycle, and there are other threads of equal or greater priority on its
* CPU's dispatch queue, sysdc_update() preempts that thread. The thread
* acknowledges the preemption by calling sysdc_preempt(), which calls
* setbackdq(), which gives other threads with the same priority a chance
* to run. This creates a de facto time quantum for threads in the SDC
* scheduling class.
*
* An SDC thread which is assigned priority 0 can continue to run if
* nothing else needs to use the CPU that it's running on. Similarly, an
* SDC thread at priority 99 might not get to run as much as it wants to
* if there are other priority-99 or higher threads on its CPU. These
* situations would cause the thread to get ahead of or behind its target
* DC; the longer the situations lasted, the further ahead or behind the
* thread would get. Rather than condemning a thread to a lifetime of
* paying for its youthful indiscretions, SDC keeps "base" values for
* ONPROC and Runnable times in each thread's sysdc data, and updates these
* values periodically. The duty cycle is then computed using the elapsed
* amount of ONPROC and Runnable times since those base times.
*
* Since sysdc_update() scans SDC threads fairly frequently, it tries to
* keep the list of "active" threads small by pruning out threads which
* have been asleep for a brief time. They are not pruned immediately upon
* going to sleep, since some threads may bounce back and forth between
* sleeping and being runnable.
*
*
* Interfaces
*
* void sysdc_thread_enter(t, dc, flags)
*
* Moves a kernel thread from the SYS scheduling class to the
* SDC class. t must have an associated LWP (created by calling
* lwp_kernel_create()). The thread will have a target DC of dc.
* Flags should be either 0 or SYSDC_THREAD_BATCH. If
* SYSDC_THREAD_BATCH is specified, the thread is expected to be
* doing large amounts of processing.
*
*
* Complications
*
* - Run queue balancing
*
* The Solaris dispatcher is biased towards letting a thread run
* on the same CPU which it last ran on, if no more than 3 ticks
* (i.e. rechoose_interval) have passed since the thread last ran.
* This helps to preserve cache warmth. On the other hand, it also
* tries to keep the per-CPU run queues fairly balanced; if the CPU
* chosen for a runnable thread has a run queue which is three or
* more threads longer than a neighboring CPU's queue, the runnable
* thread is dispatched onto the neighboring CPU instead.
*
* These policies work well for some workloads, but not for many SDC
* threads. The taskq client of SDC, for example, has many discrete
* units of work to do. The work units are largely independent, so
* cache warmth is not an important consideration. It is important
* that the threads fan out quickly to different CPUs, since the
* amount of work these threads have to do (a few seconds worth at a
* time) doesn't leave much time to correct thread placement errors
* (i.e. two SDC threads being dispatched to the same CPU).
*
* To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
* This tells the dispatcher to keep neighboring run queues' lengths
* more evenly matched, which allows SDC threads to migrate more
* easily.
*
* - LWPs and system processes
*
* SDC can only be used for kernel threads. Since SDC uses microstate
* accounting data to compute each thread's actual duty cycle, all
* threads entering the SDC class must have associated LWPs (which
* store the microstate data). This means that the threads have to
* be associated with an SSYS process, i.e. one created by newproc().
* If the microstate accounting information is ever moved into the
* kthread_t, this restriction could be lifted.
*
* - Dealing with oversubscription
*
* Since SDC duty cycles are per-thread, it is possible that the
* aggregate requested duty cycle of all SDC threads in a processor
* set could be greater than the total CPU time available in that set.
* The FSS scheduling class has an analogous situation, which it deals
* with by reducing each thread's allotted CPU time proportionally.
* Since SDC doesn't need to be as precise as FSS, it uses a simpler
* solution to the oversubscription problem.
*
* sysdc_update() accumulates the amount of time that max-priority SDC
* threads have spent on-CPU in each processor set, and uses that sum
* to create an implied duty cycle for that processor set:
*
* accumulated CPU time
* pset DC = -----------------------------------
* (# CPUs) * time since last update
*
* If this implied duty cycle is above a maximum pset duty cycle (90%
* by default), sysdc_update() sets the priority of all SDC threads
* in that processor set to sysdc_minpri for a "break" period. After
* the break period, it waits for a "nobreak" period before trying to
* enforce the pset duty cycle limit again.
*
* - Processor sets
*
* As the above implies, SDC is processor set aware, but it does not
* currently allow threads to change processor sets while in the SDC
* class. Instead, those threads must join the desired processor set
* before entering SDC. [1]
*
* - Batch threads
*
 * A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
* flag. This flag currently has no effect, but marks threads which
* do bulk processing.
*
* - t_kpri_req
*
* The TS and FSS scheduling classes pay attention to t_kpri_req,
* which provides a simple form of priority inheritance for
* synchronization primitives (such as rwlocks held as READER) which
* cannot be traced to a unique thread. The SDC class does not honor
* t_kpri_req, for a few reasons:
*
* 1. t_kpri_req is notoriously inaccurate. A measure of its
* inaccuracy is that it needs to be cleared every time a thread
* returns to user mode, because it is frequently non-zero at that
* point. This can happen because "ownership" of synchronization
* primitives that use t_kpri_req can be silently handed off,
* leaving no opportunity to will the t_kpri_req inheritance.
*
* 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
* kernel priority. This means that even if an SDC thread
* is holding a synchronization primitive and running at low
* priority, its priority will eventually be raised above 60,
* allowing it to drive on and release the resource.
*
* 3. The first consumer of SDC uses the taskq subsystem, which holds
* a reader lock for the duration of the task's execution. This
* would mean that SDC threads would never drop below kernel
* priority in practice, which defeats one of the purposes of SDC.
*
* - Why not FSS?
*
* It might seem that the existing FSS scheduling class could solve
* the problems that SDC is attempting to solve. FSS's more precise
* solution to the oversubscription problem would hardly cause
* trouble, as long as it performed well. SDC is implemented as
* a separate scheduling class for two main reasons: the initial
* consumer of SDC does not map well onto the "project" abstraction
* that is central to FSS, and FSS does not expect to run at kernel
* priorities.
*
*
* Tunables
*
* - sysdc_update_interval_msec: Number of milliseconds between
* consecutive thread priority updates.
*
* - sysdc_reset_interval_msec: Number of milliseconds between
* consecutive resets of a thread's base ONPROC and Runnable
* times.
*
* - sysdc_prune_interval_msec: Number of milliseconds of sleeping
* before a thread is pruned from the active list.
*
* - sysdc_max_pset_DC: Allowable percentage of a processor set's
* CPU time which SDC can give to its high-priority threads.
*
* - sysdc_break_msec: Number of milliseconds of "break" taken when
* sysdc_max_pset_DC is exceeded.
*
*
* Future work (in SDC and related subsystems)
*
* - Per-thread rechoose interval (0 for SDC)
*
* Allow each thread to specify its own rechoose interval. SDC
* threads would specify an interval of zero, which would rechoose
* the CPU with the lowest priority once per update.
*
* - Allow threads to change processor sets after joining the SDC class
*
* - Thread groups and per-group DC
*
* It might be nice to be able to specify a duty cycle which applies
* to a group of threads in aggregate.
*
* - Per-group DC callback to allow dynamic DC tuning
*
* Currently, DCs are assigned when the thread joins SDC. Some
* workloads could benefit from being able to tune their DC using
* subsystem-specific knowledge about the workload.
*
* - Finer-grained priority updates
*
* - More nuanced management of oversubscription
*
* - Moving other CPU-intensive threads into SDC
*
* - Move msacct data into kthread_t
*
* This would allow kernel threads without LWPs to join SDC.
*
*
* Footnotes
*
* [1] The details of doing so are left as an exercise for the reader.
*/
#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>
#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/schedctl.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/var.h>
/*
 * Tunables - loaded into the internal state at module load time
 * (see sysdc_initparam()).  See the "Tunables" section of the big
 * comment above for full descriptions.
 */
uint_t sysdc_update_interval_msec = 20;	/* ms between sysdc_update() passes */
uint_t sysdc_reset_interval_msec = 400;	/* ms between base-time resets */
uint_t sysdc_prune_interval_msec = 100;	/* ms asleep before pruning */
uint_t sysdc_max_pset_DC = 90;		/* max pset duty cycle, in percent */
uint_t sysdc_break_msec = 80;		/* length of a "break", in ms */
/*
 * Internal state - constants set up by sysdc_initparam().
 *
 * These are the tunables above, converted into the tick- and
 * update-count-based units used at runtime.
 */
static clock_t sysdc_update_ticks; /* ticks between updates */
static uint_t sysdc_prune_updates; /* updates asleep before pruning */
static uint_t sysdc_reset_updates; /* # of updates before reset */
static uint_t sysdc_break_updates; /* updates to break */
static uint_t sysdc_nobreak_updates; /* updates to not check */
static uint_t sysdc_minDC; /* minimum allowed DC */
static uint_t sysdc_maxDC; /* maximum allowed DC */
static pri_t sysdc_minpri; /* minimum allowed priority */
static pri_t sysdc_maxpri; /* maximum allowed priority */
/*
 * Internal state - mutable; sysdc_pset_lock protects the pset list and
 * the timeout-started flag.
 */
static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
static list_t sysdc_psets; /* list of psets with SDC threads */
static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
static uint_t sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
static sysdc_t sysdc_dummy; /* used to terminate active lists */
/*
 * Internal state - active hash table.
 *
 * Active threads live on one of SYSDC_NLISTS singly-linked lists,
 * chosen by hashing the sysdc_t's address.  The low 6 bits of the
 * address are discarded first (presumably because allocations share
 * low-order alignment bits -- TODO confirm kmem alignment assumption).
 */
#define SYSDC_NLISTS 8
#define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t sysdc_active[SYSDC_NLISTS];
/* The bucket for a given sysdc_t. */
#define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
#ifdef DEBUG
/*
 * Event counters, bumped via SYSDC_INC_STAT() at points of interest.
 * DEBUG-only; on non-DEBUG kernels SYSDC_INC_STAT() compiles away.
 */
static struct {
	uint64_t sysdc_update_times_asleep;
	uint64_t sysdc_update_times_base_ran_backwards;
	uint64_t sysdc_update_times_already_done;
	uint64_t sysdc_update_times_cur_ran_backwards;
	uint64_t sysdc_compute_pri_breaking;
	uint64_t sysdc_activate_enter;
	uint64_t sysdc_update_enter;
	uint64_t sysdc_update_exited;
	uint64_t sysdc_update_not_sdc;
	uint64_t sysdc_update_idle;
	uint64_t sysdc_update_take_break;
	uint64_t sysdc_update_no_psets;
	uint64_t sysdc_tick_not_sdc;
	uint64_t sysdc_tick_quantum_expired;
	uint64_t sysdc_thread_enter_enter;
} sysdc_stats;
#define SYSDC_INC_STAT(x) (sysdc_stats.x++)
#else
#define SYSDC_INC_STAT(x) ((void)0)
#endif
/* macros are UPPER CASE */
/* HOWMANY: ceiling division.  MSECTOTICKS: milliseconds to clock ticks. */
#define HOWMANY(a, b) howmany((a), (b))
#define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
static void
sysdc_initparam(void)
{
uint_t sysdc_break_ticks;
/* update / prune intervals */
sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
sysdc_update_interval_msec);
sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
sysdc_update_interval_msec);
/* We must get at least a little time on CPU. */
sysdc_minDC = 1;
sysdc_maxDC = SYSDC_DC_MAX;
sysdc_minpri = 0;
sysdc_maxpri = maxclsyspri - 1;
/* break parameters */
if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
sysdc_max_pset_DC = SYSDC_DC_MAX;
}
sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
/*
* We want:
*
* sysdc_max_pset_DC = (nobreak / (break + nobreak))
*
* ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
*
* sysdc_max_pset_DC * break
* ==> nobreak = -------------------------
* 1 - sysdc_max_pset_DC
*/
sysdc_nobreak_updates =
HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
(SYSDC_DC_MAX - sysdc_max_pset_DC));
sysdc_param_init = 1;
}
#undef HOWMANY
#undef MSECTOTICKS
/*
 * Flags for sysdc_update_times() and sysdc_compute_pri(), identifying
 * which path is asking for the recomputation.
 */
#define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
#define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
#define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
/*
 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 * and Runnable times since the last reset.
 *
 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 * to track processor set usage.  (newO is only guaranteed to be stored
 * through when SDC_UPDATE_TIMEOUT is set; the "already done this tick"
 * early return below can leave it untouched for other callers, which
 * sysdc_compute_pri() asserts.)
 *
 * flags is a mask of the SDC_UPDATE_* values.  Caller must hold the
 * thread lock.
 */
static void
sysdc_update_times(sysdc_t *sdc, uint_t flags,
    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
{
	kthread_t *const t = sdc->sdc_thread;
	const uint_t initial = (flags & SDC_UPDATE_INITIAL);
	const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
	const clock_t now = ddi_get_lbolt();
	uint_t do_reset;

	ASSERT(THREAD_LOCK_HELD(t));

	*O = *R = 0;

	/*
	 * If we've been sleeping, we know we haven't had any ONPROC time.
	 * Note that the first update after going to sleep does NOT take
	 * this path (sdc_sleep_updates == sdc_nupdates at that point), so
	 * one full accounting pass still happens after the sleep begins.
	 */
	if (sdc->sdc_sleep_updates != 0 &&
	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
		*newO = sdc->sdc_last_base_O;
		SYSDC_INC_STAT(sysdc_update_times_asleep);
		return;
	}

	/*
	 * If this is our first update, or we've hit the reset point,
	 * we need to reset our base_{O,R}.  Once we've updated them, we
	 * report O and R for the entire prior interval.
	 */
	do_reset = initial;
	if (update) {
		++sdc->sdc_nupdates;
		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
			do_reset = 1;
	}
	if (do_reset) {
		hrtime_t baseO, baseR;

		if (initial) {
			/*
			 * Start off our cycle count somewhere in the middle,
			 * to keep the resets from all happening at once.
			 *
			 * 4999 is a handy prime much larger than
			 * sysdc_reset_updates, so that we don't run into
			 * trouble if the resolution is a multiple of
			 * sysdc_reset_updates.
			 */
			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
			    sysdc_reset_updates);
			baseO = baseR = 0;
		} else {
			baseO = sdc->sdc_base_O;
			baseR = sdc->sdc_base_R;
		}

		/* Take a fresh microstate snapshot as the new base. */
		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
		*newO = sdc->sdc_base_O;

		sdc->sdc_reset = now;
		sdc->sdc_pri_check = -1; /* force mismatch below */

		/*
		 * See below for rationale: microstate accounting can
		 * transiently appear to run backwards; if the old base is
		 * ahead of the new snapshot, discard it.
		 */
		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
			baseO = sdc->sdc_base_O;
			baseR = sdc->sdc_base_R;
		}

		/* compute based on the entire interval */
		*O = (sdc->sdc_base_O - baseO);
		*R = (sdc->sdc_base_R - baseR);
		return;
	}

	/*
	 * If we're called from sysdc_update(), we *must* return a value
	 * for newO, so we always call mstate_systhread_times().
	 *
	 * Otherwise, if we've already done a pri check this tick,
	 * we can skip it.
	 */
	if (!update && sdc->sdc_pri_check == now) {
		SYSDC_INC_STAT(sysdc_update_times_already_done);
		return;
	}

	/* Get the current times from the thread */
	sdc->sdc_pri_check = now;
	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
	*newO = sdc->sdc_cur_O;

	/*
	 * The updating of microstate accounting is not done under a
	 * consistent set of locks, particularly the t_waitrq field.  This
	 * can lead to narrow windows in which we account for time in the
	 * wrong bucket, which on the next read will be accounted for
	 * correctly.
	 *
	 * If our sdc_base_* fields were affected by one of these blips, we
	 * throw away the old data, and pretend this tick didn't happen.
	 */
	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
	    sdc->sdc_cur_R < sdc->sdc_base_R) {
		sdc->sdc_base_O = sdc->sdc_cur_O;
		sdc->sdc_base_R = sdc->sdc_cur_R;
		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
		return;
	}

	/* Elapsed ONPROC and Runnable time since the base snapshot. */
	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
}
/*
 * sysdc_compute_pri()
 *
 *	Recomputes the priority of the thread, leaving the result in
 *	sdc->sdc_epri.  Returns 1 if a priority update should occur
 *	(which will also trigger a cpu_surrender()), otherwise
 *	returns 0.
 *
 *	flags is a mask of the SDC_UPDATE_* values; both the TIMEOUT
 *	and TICK paths unconditionally return 1 so that the caller
 *	forces a cpu_surrender().  Caller must hold the thread lock.
 */
static uint_t
sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
{
	kthread_t *const t = sdc->sdc_thread;
	const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
	const uint_t tick = (flags & SDC_UPDATE_TICK);

	hrtime_t O, R;
	hrtime_t newO = -1;

	ASSERT(THREAD_LOCK_HELD(t));

	sysdc_update_times(sdc, flags, &O, &R, &newO);
	/* sysdc_update_times() always sets newO for TIMEOUT callers. */
	ASSERT(!update || newO != -1);

	/* If we have new data, recompute our priority. */
	if ((O + R) != 0) {
		/* actual duty cycle, scaled into [0, SYSDC_DC_MAX] */
		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);

		/* Adjust our priority to move our DC closer to the target. */
		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
			sdc->sdc_pri = sdc->sdc_maxpri;
		else
			sdc->sdc_pri = sdc->sdc_minpri;
	}

	/*
	 * If our per-pset duty cycle goes over the max, we will take a break.
	 * This forces all sysdc threads in the pset to minimum priority, in
	 * order to let everyone else have a chance at the CPU.
	 */
	if (sdc->sdc_pset->sdp_need_break) {
		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
		sdc->sdc_epri = sdc->sdc_minpri;
	} else {
		sdc->sdc_epri = sdc->sdc_pri;
	}

	DTRACE_PROBE4(sysdc__compute__pri,
	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
	    uint_t, sdc->sdc_target_DC);

	/*
	 * For sysdc_update(), we compute the ONPROC time for high-priority
	 * threads, which is used to calculate the per-pset duty cycle.  We
	 * will always tell our callers to update the thread's priority,
	 * since we want to force a cpu_surrender().
	 *
	 * We reset sdc_update_ticks so that sysdc_tick() will only update
	 * the thread's priority if our timeout is delayed by a tick or
	 * more.
	 */
	if (update) {
		/* SDC threads are not allowed to change cpupart bindings. */
		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);

		/* If we were at MAXPRI, account for our onproc time. */
		if (t->t_pri == sdc->sdc_maxpri &&
		    sdc->sdc_last_base_O != 0 &&
		    sdc->sdc_last_base_O < newO) {
			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
			sdc->sdc_pset->sdp_onproc_time +=
			    (uint64_t)sdc->sdc_last_O;
			sdc->sdc_pset->sdp_onproc_threads++;
		} else {
			sdc->sdc_last_O = 0;
		}
		sdc->sdc_last_base_O = newO;

		/* +1 gives the timeout a one-tick grace period. */
		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
		return (1);
	}

	/*
	 * Like sysdc_update(), sysdc_tick() always wants to update the
	 * thread's priority, so that the CPU is surrendered if necessary.
	 * We reset sdc_update_ticks so that if the timeout continues to be
	 * delayed, we'll update at the regular interval.
	 */
	if (tick) {
		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
		return (1);
	}

	/*
	 * Otherwise, only tell our callers to update the priority if it has
	 * changed.
	 */
	return (sdc->sdc_epri != t->t_pri);
}
/*
 * Recompute sdc's priority via sysdc_compute_pri() and, if an update
 * is called for, apply it to the thread.  Caller must hold the thread
 * lock.
 */
static void
sysdc_update_pri(sysdc_t *sdc, uint_t flags)
{
	kthread_t *const owner = sdc->sdc_thread;

	ASSERT(THREAD_LOCK_HELD(owner));

	/* Nothing to do unless sysdc_compute_pri() asks for an update. */
	if (sysdc_compute_pri(sdc, flags) == 0)
		return;

	/* A zero return from thread_change_pri() means surrender the CPU. */
	if (!thread_change_pri(owner, sdc->sdc_epri, 0))
		cpu_surrender(owner);
}
/*
 * Add a thread onto the active list.  It will only be removed by
 * sysdc_update().
 */
static void
sysdc_activate(sysdc_t *sdc)
{
	sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
	sysdc_t *head;
	kthread_t *t = sdc->sdc_thread;

	SYSDC_INC_STAT(sysdc_activate_enter);

	ASSERT(sdc->sdc_next == NULL);
	ASSERT(THREAD_LOCK_HELD(t));

	/*
	 * Lock-free push onto the bucket's singly-linked list: retry the
	 * compare-and-swap if another activation (or sysdc_update()'s
	 * list swap) changed the head underneath us.
	 */
	do {
		head = *headp;
		sdc->sdc_next = head;
	} while (atomic_cas_ptr(headp, head, sdc) != head);
}
/*
 * sysdc_update() has two jobs:
 *
 *	1.  It updates the priorities of all active SDC threads on the system.
 *	2.  It measures pset CPU usage and enforces sysdc_max_pset_DC.
 *
 * This is the callback of the repeating timeout started in
 * sysdc_enterclass(); it re-arms itself each pass unless no SDC psets
 * (and therefore no SDC threads) remain.
 */
static void
sysdc_update(void *arg)
{
	int idx;
	sysdc_t *freelist = NULL;
	sysdc_pset_t *cur;
	hrtime_t now, diff;
	uint_t redeploy = 1;

	SYSDC_INC_STAT(sysdc_update_enter);
	ASSERT(sysdc_update_timeout_started);

	/*
	 * If this is our first time through, diff will be gigantic, and
	 * no breaks will be necessary.
	 */
	now = gethrtime();
	diff = now - sysdc_last_update;
	sysdc_last_update = now;

	/* Latch each pset's break decision and clear its usage counters. */
	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {
		boolean_t breaking = (cur->sdp_should_break != 0);

		if (cur->sdp_need_break != breaking) {
			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
			    boolean_t, breaking);
		}
		cur->sdp_onproc_time = 0;
		cur->sdp_onproc_threads = 0;
		cur->sdp_need_break = breaking;
	}
	mutex_exit(&sysdc_pset_lock);

	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
		sysdc_list_t *sdl = &sysdc_active[idx];
		sysdc_t *volatile *headp = &sdl->sdl_list;
		sysdc_t *head, *tail;
		sysdc_t **prevptr;

		if (*headp == &sysdc_dummy)
			continue;

		/* Prevent any threads from exiting while we're poking them. */
		mutex_enter(&sdl->sdl_lock);

		/*
		 * Each sdl_list contains a singly-linked list of active
		 * threads. Threads which become active while we are
		 * processing the list will be added to sdl_list. Since we
		 * don't want that to interfere with our own processing, we
		 * swap in an empty list. Any newly active threads will
		 * go on to this empty list. When finished, we'll put any
		 * such threads at the end of the processed list.
		 */
		head = atomic_swap_ptr(headp, &sysdc_dummy);
		prevptr = &head;
		while (*prevptr != &sysdc_dummy) {
			sysdc_t *const sdc = *prevptr;
			kthread_t *const t = sdc->sdc_thread;

			/*
			 * If the thread has exited, move its sysdc_t onto
			 * freelist, to be freed later.
			 */
			if (t == NULL) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_exited);
				sdc->sdc_next = freelist;
				freelist = sdc;
				continue;
			}

			thread_lock(t);
			if (t->t_cid != sysdccid) {
				/* thread has left SDC (see sysdc_exit()) */
				thread_unlock(t);
				prevptr = &sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_not_sdc);
				continue;
			}
			ASSERT(t->t_cldata == sdc);

			/*
			 * If the thread has been sleeping for longer
			 * than sysdc_prune_interval, make it inactive by
			 * removing it from the list.
			 *
			 * NOTE(review): sdc_nupdates is advanced one past
			 * sdc_sleep_updates on the first update after the
			 * thread sleeps, so this unsigned subtraction wraps
			 * and the comparison fires on the following pass --
			 * i.e. pruning appears to happen sooner than
			 * sysdc_prune_updates passes.  Confirm intent.
			 */
			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
			    sdc->sdc_sleep_updates != 0 &&
			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
			    sysdc_prune_updates) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_idle);
				sdc->sdc_next = NULL;
				thread_unlock(t);
				continue;
			}
			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
			thread_unlock(t);

			prevptr = &sdc->sdc_next;
		}

		/*
		 * Add our list to the bucket, putting any new entries
		 * added while we were working at the tail of the list.
		 */
		do {
			tail = *headp;
			*prevptr = tail;
		} while (atomic_cas_ptr(headp, tail, head) != tail);

		mutex_exit(&sdl->sdl_lock);
	}

	/* Compute each pset's implied duty cycle; decide about breaks. */
	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {
		/* total CPU time available in the pset this interval */
		cur->sdp_vtime_last_interval =
		    diff * cur->sdp_cpupart->cp_ncpus;
		cur->sdp_DC_last_interval =
		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
		    cur->sdp_vtime_last_interval;
		if (cur->sdp_should_break > 0) {
			cur->sdp_should_break--;	/* breaking */
			continue;
		}
		if (cur->sdp_dont_break > 0) {
			cur->sdp_dont_break--;	/* waiting before checking */
			continue;
		}
		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
			cur->sdp_should_break = sysdc_break_updates;
			cur->sdp_dont_break = sysdc_nobreak_updates;
			SYSDC_INC_STAT(sysdc_update_take_break);
		}
	}

	/*
	 * If there are no sysdc_psets, there can be no threads, so
	 * we can stop doing our timeout.  Since we're holding the
	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
	 * prevent anyone from racing with this and dropping our timeout
	 * on the floor.
	 */
	if (list_is_empty(&sysdc_psets)) {
		SYSDC_INC_STAT(sysdc_update_no_psets);
		ASSERT(sysdc_update_timeout_started);
		sysdc_update_timeout_started = 0;

		redeploy = 0;
	}
	mutex_exit(&sysdc_pset_lock);

	/* Free the sysdc_ts of threads which have exited. */
	while (freelist != NULL) {
		sysdc_t *cur = freelist;

		freelist = cur->sdc_next;
		kmem_free(cur, sizeof (*cur));
	}

	if (redeploy) {
		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
	}
}
/*
 * Preemption handler.  An SDC thread acknowledging preemption goes to
 * the back of its dispatch queue, giving same-priority threads a turn;
 * per the big comment above, this creates SDC's de facto time quantum.
 */
static void
sysdc_preempt(kthread_t *t)
{
	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t == curthread);

	/* give others a chance to run */
	setbackdq(t);
}
/*
 * Clock-tick handler for an SDC thread.
 *
 * Surrenders the CPU if a higher-priority thread is waiting on our
 * dispatch queue, and acts as a backstop for sysdc_update(): if the
 * update timeout is delayed long enough for sdc_ticks to catch up to
 * sdc_update_ticks, the thread's priority is recomputed here.
 */
static void
sysdc_tick(kthread_t *t)
{
	sysdc_t *sdc;

	thread_lock(t);
	if (t->t_cid != sysdccid) {
		/* thread has left SDC since the tick was set up */
		SYSDC_INC_STAT(sysdc_tick_not_sdc);
		thread_unlock(t);
		return;
	}
	sdc = t->t_cldata;
	if (t->t_state == TS_ONPROC &&
	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
		cpu_surrender(t);
	}

	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
		/* a runnable thread cannot have sleep bookkeeping set */
		ASSERT(sdc->sdc_sleep_updates == 0);
	}

	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
	sdc->sdc_ticks++;
	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
		/* the update timeout is late; do its work ourselves */
		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
	}
	thread_unlock(t);
}
/*
 * Make the thread runnable.  Clears the sleep bookkeeping, re-adds the
 * thread to the active list if sysdc_update() had pruned it, and puts
 * it on a dispatch queue.
 */
static void
sysdc_setrun(kthread_t *t)
{
	sysdc_t *sdc = t->t_cldata;

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	sdc->sdc_sleep_updates = 0;

	/* sdc_next == NULL means we are not on an active list */
	if (sdc->sdc_next == NULL) {
		/*
		 * Since we're in transition, we don't want to use the
		 * full thread_update_pri().
		 */
		if (sysdc_compute_pri(sdc, 0)) {
			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
		}
		sysdc_activate(sdc);

		ASSERT(sdc->sdc_next != NULL);
	}

	setbackdq(t);
}
/*
 * Wakeup handler: waking from sleep is handled identically to any
 * other transition to runnable.
 */
static void
sysdc_wakeup(kthread_t *t)
{
	sysdc_setrun(t);
}
/*
 * Sleep handler: record the update count at which the thread went to
 * sleep.  sysdc_update_times() uses this to skip accounting for
 * sleeping threads, and sysdc_update() uses it to decide when to prune
 * the thread from the active list.
 */
static void
sysdc_sleep(kthread_t *t)
{
	sysdc_t *sdc = t->t_cldata;

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
}
/*
 * Move a SYS-class kernel thread into SDC.  parmsp carries the
 * sysdc_params_t (min/max priority and target DC); bufp is the sysdc_t
 * preallocated by sysdc_alloc(), which includes a preallocated pset
 * structure in case this is the first SDC thread in its cpupart.
 * Returns 0 on success, or EPERM if t is not a SYS-class thread.
 */
/*ARGSUSED*/
static int
sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
    void *bufp)
{
	cpupart_t *const cpupart = t->t_cpupart;
	sysdc_t *sdc = bufp;
	sysdc_params_t *sdpp = parmsp;
	sysdc_pset_t *newpset = sdc->sdc_pset;
	sysdc_pset_t *pset;
	int start_timeout;

	/* Only SYS-class kernel threads may join SDC. */
	if (t->t_cid != syscid)
		return (EPERM);
	ASSERT(ttolwp(t) != NULL);
	ASSERT(sdpp != NULL);
	ASSERT(newpset != NULL);
	ASSERT(sysdc_param_init);

	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);

	sdc->sdc_thread = t;
	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
	sdc->sdc_minpri = sdpp->sdp_minpri;
	sdc->sdc_maxpri = sdpp->sdp_maxpri;
	sdc->sdc_target_DC = sdpp->sdp_DC;
	sdc->sdc_ticks = 0;
	sdc->sdc_update_ticks = sysdc_update_ticks + 1;

	/* Assign ourselves to the appropriate pset. */
	sdc->sdc_pset = NULL;
	mutex_enter(&sysdc_pset_lock);
	for (pset = list_head(&sysdc_psets); pset != NULL;
	    pset = list_next(&sysdc_psets, pset)) {
		if (pset->sdp_cpupart == cpupart) {
			break;
		}
	}
	if (pset == NULL) {
		/* first SDC thread in this cpupart: use our preallocation */
		pset = newpset;
		newpset = NULL;
		pset->sdp_cpupart = cpupart;
		list_insert_tail(&sysdc_psets, pset);
	}
	pset->sdp_nthreads++;
	ASSERT(pset->sdp_nthreads > 0);

	sdc->sdc_pset = pset;

	/*
	 * The first thread in is responsible for starting the
	 * sysdc_update() timeout; the flag is examined and set under
	 * sysdc_pset_lock so only one caller can see it clear.
	 */
	start_timeout = (sysdc_update_timeout_started == 0);
	sysdc_update_timeout_started = 1;
	mutex_exit(&sysdc_pset_lock);

	/* The preallocated pset was not needed; discard it. */
	if (newpset != NULL)
		kmem_free(newpset, sizeof (*newpset));

	/* Update t's scheduling class and priority. */
	thread_lock(t);
	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
	t->t_cid = cid;
	t->t_cldata = sdc;
	t->t_schedflag |= TS_RUNQMATCH;	/* ease migration; see big comment */

	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
	thread_unlock(t);

	/* Kick off the thread timeout if we're the first one in. */
	if (start_timeout) {
		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
	}

	return (0);
}
/*
 * Tear down sdc's associations when its thread leaves the class.
 *
 * The sysdc_t itself is freed here only if it is not on an active list
 * (sdc_next == NULL); otherwise sysdc_update() will see the NULLed
 * sdc_thread and free it.  The pset structure is freed when its last
 * thread leaves.
 */
static void
sysdc_leave(sysdc_t *sdc)
{
	sysdc_pset_t *sdp = sdc->sdc_pset;
	sysdc_list_t *sdl = SYSDC_LIST(sdc);
	uint_t freedc;

	mutex_enter(&sdl->sdl_lock);	/* block sysdc_update() */
	sdc->sdc_thread = NULL;		/* mark exited for sysdc_update() */
	freedc = (sdc->sdc_next == NULL);	/* not on an active list */
	mutex_exit(&sdl->sdl_lock);

	mutex_enter(&sysdc_pset_lock);
	ASSERT(sdp != NULL);
	ASSERT(sdp->sdp_nthreads > 0);
	--sdp->sdp_nthreads;
	if (sdp->sdp_nthreads == 0) {
		list_remove(&sysdc_psets, sdp);
	} else {
		sdp = NULL;	/* pset still in use; don't free it below */
	}
	mutex_exit(&sysdc_pset_lock);

	if (freedc)
		kmem_free(sdc, sizeof (*sdc));
	if (sdp != NULL)
		kmem_free(sdp, sizeof (*sdp));
}
static void
sysdc_exitclass(void *buf)
{
sysdc_leave((sysdc_t *)buf);
}
/*
 * SDC membership is permanent; the only way out is thread exit
 * (see sysdc_exit()).
 */
/*ARGSUSED*/
static int
sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
	/* Threads cannot exit SDC once joined, except in a body bag. */
	return (EPERM);
}
/*
 * Thread-exit handler.  Since threads cannot otherwise leave SDC
 * (sysdc_canexit() returns EPERM), this is the only path out of the
 * class: the thread is switched back to SYS and its sdc is unlinked.
 */
static void
sysdc_exit(kthread_t *t)
{
	sysdc_t *sdc;

	/* We're exiting, so we just rejoin the SYS class. */
	thread_lock(t);
	ASSERT(t->t_cid == sysdccid);
	sdc = t->t_cldata;
	t->t_cid = syscid;
	t->t_cldata = NULL;
	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
	(void) thread_change_pri(t, maxclsyspri, 0);
	t->t_schedflag &= ~TS_RUNQMATCH;
	thread_unlock_nopreempt(t);	/* unlock without allowing preemption */

	/* Unlink the sdc from everything. */
	sysdc_leave(sdc);
}
/*ARGSUSED*/
static int
sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
	/*
	 * Threads cannot be created with SDC as their class; they must
	 * be created as SYS and then added with sysdc_thread_enter().
	 * Because of this restriction, sysdc_fork() should never be called.
	 */
	panic("sysdc cannot be forked");

	return (ENOSYS);	/* unreachable; satisfies the prototype */
}
/*ARGSUSED*/
/*
 * CL_FORKRET callback.  Unreachable for the same reason as
 * sysdc_fork().
 */
static void
sysdc_forkret(kthread_t *t, kthread_t *ct)
{
	/* SDC threads are part of system processes, which never fork. */
	panic("sysdc cannot be forked");
}
/*
 * CL_GLOBPRI callback: report the thread's global priority.  SDC
 * keeps its computed priority in t_epri (presumably maintained by
 * sysdc_update_pri() — confirm outside this chunk).
 */
static pri_t
sysdc_globpri(kthread_t *t)
{
	return (t->t_epri);
}
/*ARGSUSED*/
/*
 * Shared CL_SWAPIN/CL_SWAPOUT callback.  Returning -1 marks the
 * thread as ineligible for swapping.
 */
static pri_t
sysdc_no_swap(kthread_t *t, int flags)
{
	/* SDC threads cannot be swapped. */
	return (-1);
}
/*
 * CL_GETCLPRI callback: report the range of priorities that SDC
 * threads may run at.
 */
static int
sysdc_getclpri(pcpri_t *pcprip)
{
	pcprip->pc_clpmin = sysdc_minpri;
	pcprip->pc_clpmax = sysdc_maxpri;

	return (0);
}
/*ARGSUSED*/
/*
 * CL_GETCLINFO callback.  SDC exports no class-specific information,
 * so this is a successful no-op.
 */
static int
sysdc_getclinfo(void *arg)
{
	return (0);	/* no class-specific info */
}
/*ARGSUSED*/
/*
 * CL_ALLOC callback: allocate the class-private data for a thread
 * about to enter SDC.  A pset structure is preallocated here as well,
 * so that sysdc_enterclass() never has to allocate while holding
 * locks.  On success *p holds the new sysdc_t; on failure *p is NULL
 * and ENOMEM is returned (possible only with a non-sleeping flag).
 */
static int
sysdc_alloc(void **p, int flag)
{
	sysdc_t *new;

	*p = NULL;

	new = kmem_zalloc(sizeof (*new), flag);
	if (new == NULL)
		return (ENOMEM);

	new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag);
	if (new->sdc_pset == NULL) {
		kmem_free(new, sizeof (*new));
		return (ENOMEM);
	}

	*p = new;
	return (0);
}
/*
 * CL_FREE callback: release class-private data allocated by
 * sysdc_alloc() but never consumed by CL_ENTERCLASS().
 */
static void
sysdc_free(void *p)
{
	sysdc_t *sdc = p;

	if (sdc == NULL)
		return;

	/*
	 * We only get here when CL_ENTERCLASS() was never successfully
	 * run on this buffer, so the preallocated pset must still be
	 * attached and must never have been bound to a cpu partition.
	 */
	ASSERT(sdc->sdc_pset != NULL);
	ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);

	kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
	kmem_free(sdc, sizeof (*sdc));
}
/*
 * Generic stub handlers for class operations SDC does not implement.
 * They are declared without prototypes so one definition can stand in
 * for table entries whose real signatures differ.
 */
static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();
/*
 * SDC's operations vector, handed to the dispatcher via sysdc_init().
 * Entries are positional — the order must match struct classfuncs
 * exactly, which is why unimplemented slots are filled with the
 * generic stubs above.
 */
static struct classfuncs sysdc_classfuncs = {
	/* messages to class manager */
	{
		sysdc_enosys,	/* admin */
		sysdc_getclinfo,
		sysdc_enosys,	/* parmsin */
		sysdc_enosys,	/* parmsout */
		sysdc_enosys,	/* vaparmsin */
		sysdc_enosys,	/* vaparmsout */
		sysdc_getclpri,
		sysdc_alloc,
		sysdc_free,
	},
	/* operations on threads */
	{
		sysdc_enterclass,
		sysdc_exitclass,
		sysdc_canexit,
		sysdc_fork,
		sysdc_forkret,
		sysdc_nullsys,	/* parmsget */
		sysdc_enosys,	/* parmsset */
		sysdc_nullsys,	/* stop */
		sysdc_exit,
		sysdc_nullsys,	/* active */
		sysdc_nullsys,	/* inactive */
		sysdc_no_swap,	/* swapin */
		sysdc_no_swap,	/* swapout */
		sysdc_nullsys,	/* trapret */
		sysdc_preempt,
		sysdc_setrun,
		sysdc_sleep,
		sysdc_tick,
		sysdc_wakeup,
		sysdc_einval,	/* donice */
		sysdc_globpri,
		sysdc_nullsys,	/* set_process_group */
		sysdc_nullsys,	/* yield */
		sysdc_einval,	/* doprio */
	}
};
/* Stub for unimplemented operations: "not supported". */
static int
sysdc_enosys()
{
	return (ENOSYS);
}
/* Stub for rejected operations (donice/doprio): "invalid". */
static int
sysdc_einval()
{
	return (EINVAL);
}
/* Stub for operations that are deliberate no-ops. */
static void
sysdc_nullsys()
{
}
/*ARGSUSED*/
/*
 * Class-initialization entry point, invoked by the dispatcher when the
 * SDC module loads (via the sclass table below).  Sets up the global
 * pset list and the active-thread hash lists, records our class id,
 * and hands back the operations vector.  Returns the maximum global
 * priority SDC threads will use.
 */
static pri_t
sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
{
	int i;

	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
	    offsetof(sysdc_pset_t, sdp_node));

	for (i = 0; i < SYSDC_NLISTS; i++) {
		sysdc_active[i].sdl_list = &sysdc_dummy;
	}

	sysdc_initparam();

	sysdccid = cid;
	*clfuncspp = &sysdc_classfuncs;

	return ((pri_t)v.v_maxsyspri);
}
/*
 * Scheduling-class descriptor registered with the system: class name,
 * init routine, and (unused) class-specific parameter size.
 */
static struct sclass csw = {
	"SDC",
	sysdc_init,
	0
};
/*
 * Module linkage: load SDC as a scheduling-class kernel module.
 */
static struct modlsched modlsched = {
	&mod_schedops, "system duty cycle scheduling class", &csw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
/* Module load entry point. */
int
_init()
{
	return (mod_install(&modlinkage));
}
/* Module unload entry point; always refused. */
int
_fini()
{
	return (EBUSY);		/* can't unload for now */
}
/* Module information entry point. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
/* --- consolidation-private interfaces --- */
/*
 * Consolidation-private entry point: move kernel thread t — which must
 * currently be in the SYS class with no class-private data — into SDC
 * with the requested duty cycle.
 *
 * dc is clamped to the [sysdc_minDC, sysdc_maxDC] range.  flags may
 * only contain SYSDC_THREAD_BATCH; within this function it is only
 * validated, not otherwise consumed.
 *
 * The VERIFY0()s document the expectation that none of these class
 * operations can fail in this context (the allocation uses KM_SLEEP,
 * and t is a SYS-class kernel thread).
 */
void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
	void *buf = NULL;
	sysdc_params_t sdp;

	SYSDC_INC_STAT(sysdc_thread_enter_enter);

	ASSERT(sysdc_param_init);
	ASSERT(sysdccid >= 0);

	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

	sdp.sdp_minpri = sysdc_minpri;
	sdp.sdp_maxpri = sysdc_maxpri;
	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

	VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

	ASSERT(t->t_lwp != NULL);
	ASSERT(t->t_cid == syscid);
	ASSERT(t->t_cldata == NULL);
	VERIFY0(CL_CANEXIT(t, NULL));
	VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
	/* release the old (SYS) class's hold on the thread */
	CL_EXITCLASS(syscid, NULL);
}