disp.c revision 6890d023cce317bfcb74d7e43a813d060ebd2e47
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
#include <sys/sysmacros.h>
#include <sys/schedctl.h>
#include <sys/archsystm.h>
#define BOUND_CPU 0x1
#define BOUND_PARTITION 0x2
#define BOUND_INTR 0x4
/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
int oldnglobpris;
};
/* platform-specific routine to call when processor is idle */
static void generic_idle_cpu();
void (*idle_cpu)() = generic_idle_cpu;
static void idle_enter();
static void idle_exit();
/* platform-specific routine to call when thread is enqueued */
static void generic_enq_thread(cpu_t *, int);
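/*
* The enqueue hook used later in this file as (*disp_enq_thread)() is elided
* in this excerpt; it presumably defaults to the generic routine declared
* above, along the lines of:
*/
void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;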
int nswapped; /* total number of swapped threads */
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
* will either be the max sysclass pri + 1 or the min interrupt pri.
*/
int only_intr_kpreempt;
extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
#define SETKP_BACK 0
#define SETKP_FRONT 1
/*
* Parameter that determines how recently a thread must have run
* on the CPU to be considered loosely-bound to that CPU to reduce
* cold cache effects.  The interval is in clock ticks (hz).
*/
#define RECHOOSE_INTERVAL 3
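/*
* The tunable holding this interval is elided in this excerpt; it is
* presumably declared here along the lines of:
*/
int rechoose_interval = RECHOOSE_INTERVAL;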
/*
* Parameter that determines how long (in nanoseconds) a thread must
* sit on a run queue before it can be stolen by another CPU, to
* reduce migrations.
*
* nosteal_nsec should be set by the platform code in
* cmp_set_nosteal_interval() to an appropriate value.  nosteal_nsec is set
* to NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
* Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
*/
#define NOSTEAL_UNINITIALIZED (-1)
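/*
* The nosteal_nsec variable itself is elided in this excerpt; it is
* presumably declared here along the lines of:
*/
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;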
extern void cmp_set_nosteal_interval(void);
static void cpu_dispqalloc(int numpris);
/*
* This gets returned by disp_getwork/disp_getbest if we couldn't steal
* a thread because it was sitting on its run queue for a very short
* period of time.
*/
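/*
* The sentinel itself is elided in this excerpt; it is presumably a
* distinguished non-NULL pointer value along the lines of:
*/
#define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */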
/*
* dispatcher and scheduler initialization
*/
/*
* disp_setup - Common code to calculate and allocate dispatcher
* variables and structures based on the maximum priority.
*/
static void
{
if (newnglobpris > oldnglobpris) {
/*
* Allocate new kp queues for each CPU partition.
*/
/*
* Allocate new dispatch queues for each CPU.
*/
/*
* compute new interrupt thread base priority
*/
if (only_intr_kpreempt) {
}
v.v_nglobpris = newnglobpris;
}
}
/*
* dispinit - Called to initialize all loaded classes and the
* dispatcher framework.
*/
void
dispinit(void)
{
maxglobpri = -1;
/*
* Initialize transition lock, which will always be set.
*/
/*
* Initialize the default CPU partition.
*/
/*
* Call the class specific initialization functions for
* all pre-installed schedulers.
*
* We pass the size of a class specific parameter
* buffer to each of the initialization functions
* to try to catch problems with backward compatibility
* of class modules.
*
* For example, a new class module running on an old system
* which didn't provide sufficiently large parameter buffers
* would be bad news. Class initialization modules can check for
* this and take action if they detect a problem.
*/
if (SCHED_INSTALLED(sc)) {
if (cl_maxglobpri > maxglobpri)
}
}
ASSERT(maxglobpri >= 0);
disp_setup(maxglobpri, 0);
/*
* Platform specific sticky scheduler setup.
*/
if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
/*
* Get the default class ID; this may be later modified via
* dispadmin(1M). This will load the class (normally TS) and that will
* call disp_add(), which is why we had to drop cpu_lock first.
*/
}
}
/*
* disp_add - Called with class pointer to initialize the dispatcher
* for a newly loaded class.
*/
void
{
/*
* Initialize the scheduler class.
*/
if (cl_maxglobpri > maxglobpri)
/*
* Save old queue information.  Since we're initializing a
* new scheduling class which has just been loaded, the
* size of the dispq may have changed.  We need to handle
* that here.
*/
}
/*
* For each CPU, allocate new dispatch queues
* with the stated number of priorities.
*/
static void
cpu_dispqalloc(int numpris)
{
struct disp_queue_info *disp_mem;
int i, num;
sizeof (struct disp_queue_info), KM_SLEEP);
/*
* This routine must allocate all of the memory before stopping
* the CPUs because it must not sleep in kmem_alloc while the
* CPUs are stopped.  Locks held by the stopped CPUs will not be
* released until those CPUs are restarted.
*/
i = 0;
do {
i++;
num = i;
for (i = 0; i < num; i++)
start_cpus();
/*
* All of the memory must be freed after the CPUs are restarted because
* we cannot risk sleeping in kmem_free while the CPUs are stopped.
*/
for (i = 0; i < num; i++)
disp_dq_free(&disp_mem[i]);
}
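/*
* Illustrative sketch (not the original body) of the allocate-then-pause
* pattern cpu_dispqalloc() follows, assuming the helpers that appear below
* (disp_dq_alloc(), disp_dq_assign(), disp_dq_free()) each handle one
* disp_queue_info entry per CPU:
*
*	disp_mem = kmem_zalloc(NCPU * sizeof (struct disp_queue_info),
*	    KM_SLEEP);
*	(walk the CPU list, disp_dq_alloc() each entry with KM_SLEEP)
*	pause_cpus(NULL);
*	(disp_dq_assign() each entry; no allocation is allowed here)
*	start_cpus();
*	(disp_dq_free() the replaced queues now that the CPUs run again)
*/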
static void
{
sizeof (long), KM_SLEEP);
}
static void
{
/*
* Use kcopy because bcopy is platform-specific
* and could block while we might have paused the cpus.
*/
sizeof (long));
}
}
static void
{
}
/*
* For a newly created CPU, initialize the dispatch queue.
* This is called before the CPU is known through cpu[] or on any lists.
*/
void
{
else
/*
* Allocate memory for the dispatcher queue headers
* and the active queue bitmap.
*/
sizeof (long), KM_SLEEP);
}
void
{
}
/*
* Allocate new, larger kpreempt dispatch queue to replace the old one.
*/
void
{
struct disp_queue_info mem_info;
/*
* Allocate memory for the new array.
*/
/*
* We need to copy the old structures to the new
* and free the old.
*/
}
}
/*
* Free dispatch queue.
* Used for the kpreempt queues for a removed CPU partition and
* for the per-CPU queues of deleted CPUs.
*/
void
{
struct disp_queue_info mem_info;
}
/*
* End dispatcher and scheduler initialization.
*/
/*
* See if there's anything to do other than remain idle.
* Return non-zero if there is.
*
* This function must be called with high spl, or with
* kernel preemption disabled to prevent the partition's
* active cpu list from changing while being traversed.
*
* This is essentially a simpler version of disp_getwork()
* to be called by CPUs preparing to "halt".
*/
int
disp_anywork(void)
{
return (1);
/*
* Something has appeared on the local run queue.
*/
if (*local_nrunnable > 0)
return (1);
/*
* If we encounter another idle CPU that will
* soon be trolling around through disp_anywork()
* terminate our walk here and let this other CPU
* patrol the next part of the list.
*/
return (0);
/*
* Work can be taken from another CPU if:
* - There is unbound work on the run queue
* - That work isn't a thread undergoing a
*   context switch on an otherwise empty queue.
* - The CPU isn't running the idle loop.
*/
return (1);
}
}
return (0);
}
/*
* Called when CPU enters the idle loop
*/
static void
{
}
/*
* Called when CPU exits the idle loop
*/
static void
{
}
/*
* Idle loop.
*/
void
idle()
{
kthread_t *t; /* taken thread */
idle_enter();
/*
* Uniprocessor version of idle loop.
* Do this until notified that we're on an actual multiprocessor.
*/
while (ncpus == 1) {
(*idle_cpu)();
continue;
}
idle_exit();
swtch();
idle_enter(); /* returned from swtch */
}
/*
* Multiprocessor idle loop.
*/
for (;;) {
/*
* If CPU is completely quiesced by p_online(2), just wait
* here with minimal bus traffic until put online.
*/
(*idle_cpu)();
idle_exit();
swtch();
} else {
continue;
/*
* Set kpq under lock to prevent
* migration between partitions.
*/
}
(*idle_cpu)();
continue;
}
/*
* If there was a thread but we couldn't steal
* it, then keep trying.
*/
if (t == T_DONTSTEAL)
continue;
idle_exit();
swtch_to(t);
}
idle_enter(); /* returned from swtch/swtch_to */
}
}
/*
* Preempt the currently running thread in favor of the highest
* priority thread. The class of the current thread controls
* where it goes on the dispatcher queues. If panicking, turn
* preemption off.
*/
void
preempt()
{
if (panicstr)
return;
thread_lock(t);
/*
* this thread has already been chosen to be run on
* another CPU. Clear kprunrun on this CPU since we're
* already headed for swtch().
*/
CPU->cpu_kprunrun = 0;
} else {
CL_PREEMPT(t);
swtch(); /* clears CPU->cpu_runrun via disp() */
}
}
extern kthread_t *thread_unpin();
/*
* disp() - find the highest priority thread for this processor to run, and
* set it in TS_ONPROC state so that resume() can be called to run it.
*/
static kthread_t *
disp()
{
int maxrunword;
/*
* Find the highest priority loaded, runnable thread.
*/
/*
* If there is more important work on the global queue with a better
* priority than the maximum on this CPU, take it now.
*/
"disp_end:tid %p", tp);
return (tp);
}
}
/*
* If there is nothing to run, look at what's runnable on other queues.
* Choose the idle thread if the CPU is quiesced.
* Note that CPUs that have the CPU_OFFLINE flag set can still run
* interrupt threads, which will be the only threads on the CPU's own
* queue, but cannot run threads from other queues.
*/
if (pri == -1) {
tp == T_DONTSTEAL) {
(void) splhigh();
}
} else {
}
"disp_end:tid %p", tp);
return (tp);
}
/*
* Found it so remove it from queue.
*/
dp->disp_nrunnable--;
dq->dq_sruncnt--;
/*
* The queue is empty, so the corresponding bit needs to be
* turned off in dqactmap.  If disp_nrunnable != 0, we just took the
* last runnable thread off the highest queue, so recompute
* disp_maxrunpri.
*/
if (dp->disp_nrunnable == 0) {
} else {
int ipri;
}
} else {
}
/*
* Set TS_DONT_SWAP flag to prevent another processor from swapping
* out this thread before we have a chance to run it.
* While running, it is protected against swapping by t_lock.
*/
"disp_end:tid %p", tp);
goto reschedule;
return (tp);
}
/*
* swtch()
* Find best runnable thread and run it.
* Called with the current thread already switched to a new state
* (on a sleep queue, on a run queue, or stopped), but not zombied.
* May be called at any spl level less than or equal to LOCK_LEVEL.
* Always drops spl to the base level (spl0()).
*/
void
swtch()
{
if (t->t_flag & T_INTR_THREAD)
/*
* We are an interrupt thread. Setup and return
* the interrupted thread to be resumed.
*/
(void) splhigh(); /* block other scheduler action */
next = thread_unpin();
} else {
#ifdef DEBUG
t->t_preempt == 0) {
thread_lock(t);
t->t_preempt != 0); /* cannot migrate */
}
#endif /* DEBUG */
/* OK to steal anything left on run queue */
if (next != t) {
if (t == cp->cpu_idle_thread) {
}
/*
* If t was previously in the TS_ONPROC state,
* setfrontdq and setbackdq won't have set its t_waitrq.
* Since we now finally know that we're switching away
* from this thread, set its t_waitrq if it is on a run
* queue.
*/
t->t_waitrq = gethrtime_unscaled();
}
/*
* restore mstate of thread that we are switching to
*/
if (dtrace_vtime_active)
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we may not
* return here
*/
} else {
if (t->t_flag & T_INTR_THREAD)
(void) spl0();
}
}
}
/*
* swtch_from_zombie()
* Special case of swtch(), which allows checks for TS_ZOMB to be
* eliminated from normal resume.
* Find best runnable thread and run it.
* Called with the current thread zombied.
* Zombies cannot migrate, so CPU references are safe.
*/
void
{
if (dtrace_vtime_active)
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we certainly will not
* return here
*/
}
/*
* search_disp_queues()
* Search the given dispatch queues for thread tp.
* Return 1 if tp is found, otherwise return 0.
*/
static int
{
return (1);
}
}
return (0);
}
/*
* thread_on_queue()
* Search all per-CPU dispatch queues and all partition-wide kpreempt
* queues for thread tp. Return 1 if tp is found, otherwise return 0.
*/
static int
{
/*
* Search the per-CPU dispatch queues for tp.
*/
do {
return (1);
/*
* Search the partition-wide kpreempt queues for tp.
*/
do {
return (1);
return (0);
}
#else
#endif /* DEBUG */
/*
* Like swtch(), but switch to a specified thread taken from another CPU.
* Called with spl high.
*/
void
{
/*
* Update context switch statistics.
*/
/* OK to steal anything left on run queue */
/* record last execution time */
/*
* If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
* won't have set its t_waitrq. Since we now finally know that we're
* switching away from this thread, set its t_waitrq if it is on a run
* queue.
*/
}
/* restore next thread to previously running microstate */
if (dtrace_vtime_active)
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we may not
* return here
*/
}
static void
{
int call_poke_cpu = 0;
call_poke_cpu = 1;
}
call_poke_cpu = 1;
}
}
/*
* Propagate cpu_runrun and cpu_kprunrun to global visibility.
*/
membar_enter();
if (call_poke_cpu)
}
/*
* setbackdq() keeps runqs balanced such that the difference in length
* between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
* For threads with priorities below RUNQ_MATCH_PRI, the runqs' lengths
* must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
* try to keep runqs perfectly balanced regardless of the thread priority.
*/
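/*
* A minimal sketch of the balancing test this implies (RUNQ_LEN() and the
* constants are assumed here, not reproduced verbatim from the original):
* take the chosen CPU's queue length at this priority, relax it by
* RUNQ_MAX_DIFF for priorities at or above RUNQ_MATCH_PRI (unless
* TS_RUNQMATCH forces strict matching), and move to a neighbouring CPU in
* the lgroup if its queue is shorter than that:
*
*	qlen = RUNQ_LEN(cp, tpri);
*	if (tpri >= RUNQ_MATCH_PRI && !(tp->t_schedflag & TS_RUNQMATCH))
*		qlen -= RUNQ_MAX_DIFF;
*	if (qlen > 0 && RUNQ_LEN(newcp, tpri) < qlen)
*		cp = newcp;
*/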
/*
* Macro that evaluates to true if it is likely that the thread has cache
* warmth. This is based on the amount of time that has elapsed since the
* thread last ran. If that amount of time is less than "rechoose_interval"
* ticks, then we decide that the thread has enough cache warmth to warrant
* some affinity for t->t_cpu.
*/
/* The macro body below is a reconstruction (a minimal sketch), not verbatim. */
#define THREAD_HAS_CACHE_WARMTH(thread) \
	((thread) == curthread || \
	(lbolt - (thread)->t_disp_time <= rechoose_interval))
/*
* Put the specified thread on the back of the dispatcher
* queue corresponding to its current priority.
*
* Called with the thread in transition, onproc or stopped state
* and locked (transition implies locked) and at high spl.
* Returns with the thread in TS_RUN state and still locked.
*/
void
{
int bound;
/*
* If thread is "swapped" or on the swap queue don't
* queue it, but wake sched.
*/
return;
}
bound = 1;
else
bound = 0;
if (ncpus == 1)
else if (!bound) {
return;
}
/*
* We'll generally let this thread continue to run where
* it last ran...but will consider migration if:
* - The thread probably doesn't have much cache warmth.
* - The CPU where it last ran is the target of an offline
* request.
* - The thread last ran outside its home lgroup.
*/
if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
} else {
}
int qlen;
/*
* Perform any CMT load balancing
*/
/*
* Balance across the run queues
*/
if (tpri >= RUNQ_MATCH_PRI &&
qlen -= RUNQ_MAX_DIFF;
if (qlen > 0) {
}
}
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
}
} else {
/*
* It is possible that t_weakbound_cpu != t_bound_cpu (for
* a short time until weak binding that existed when the
* strong binding was established has dropped) so we must
* favour weak binding over strong.
*/
}
/*
* A thread that is ONPROC may be temporarily placed on the run queue
* but then chosen to run again by disp. If the thread we're placing on
* the queue is in TS_ONPROC state, don't set its t_waitrq until a
* replacement process is actually scheduled in swtch(). In this
* situation, curthread is the only thread that could be in the ONPROC
* state.
*/
} else {
}
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
#endif /* NPROBE */
dp->disp_nrunnable++;
if (!bound)
dp->disp_steal = 0;
membar_enter();
if (dq->dq_sruncnt++ != 0) {
} else {
membar_enter();
}
}
/*
* If there are no other unbound threads on the
* run queue, don't allow other CPUs to steal
* this thread while we are in the middle of a
* context switch. We may just switch to it
* again right away. CPU_DISP_DONTSTEAL is cleared
* in swtch and swtch_to.
*/
}
}
}
/*
* Put the specified thread on the front of the dispatcher
* queue corresponding to its current priority.
*
* Called with the thread in transition, onproc or stopped state
* and locked (transition implies locked) and at high spl.
* Returns with the thread in TS_RUN state and still locked.
*/
void
{
int bound;
/*
* If thread is "swapped" or on the swap queue don't
* queue it, but wake sched.
*/
return;
}
bound = 1;
else
bound = 0;
if (ncpus == 1)
else if (!bound) {
return;
}
/*
* We'll generally let this thread continue to run
* where it last ran, but will consider migration if:
* - The thread last ran outside its home lgroup.
* - The CPU where it last ran is the target of an
* offline request (a thread_nomigrate() on the
* in-motion CPU relies on this when forcing a preempt).
* - The thread isn't the highest priority thread where
* it last ran, and it is considered not likely to
* have significant cache warmth.
*/
(cp == cpu_inmotion)) {
(!THREAD_HAS_CACHE_WARMTH(tp))) {
NULL);
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
}
} else {
/*
* It is possible that t_weakbound_cpu != t_bound_cpu (for
* a short time until weak binding that existed when the
* strong binding was established has dropped) so we must
* favour weak binding over strong.
*/
}
/*
* A thread that is ONPROC may be temporarily placed on the run queue
* but then chosen to run again by disp. If the thread we're placing on
* the queue is in TS_ONPROC state, don't set its t_waitrq until a
* replacement process is actually scheduled in swtch(). In this
* situation, curthread is the only thread that could be in the ONPROC
* state.
*/
} else {
}
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
#endif /* NPROBE */
dp->disp_nrunnable++;
if (!bound)
dp->disp_steal = 0;
membar_enter();
if (dq->dq_sruncnt++ != 0) {
} else {
membar_enter();
}
}
/*
* If there are no other unbound threads on the
* run queue, don't allow other CPUs to steal
* this thread while we are in the middle of a
* context switch. We may just switch to it
* again right away. CPU_DISP_DONTSTEAL is cleared
* in swtch and swtch_to.
*/
}
}
}
/*
* Put a high-priority unbound thread on the kp queue
*/
static void
{
dp->disp_nrunnable++;
if (dq->dq_sruncnt++ != 0) {
if (borf == SETKP_BACK) {
} else {
}
} else {
if (borf == SETKP_BACK) {
} else {
}
membar_enter();
}
}
/* migrate to a cpu in the new partition */
}
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
#endif /* NPROBE */
(*disp_enq_thread)(cp, 0);
}
/*
* Remove a thread from the dispatcher queue if it is on it.
* It is not an error if it is not found, but we return whether
* or not it was found in case the caller wants to check.
*/
int
{
int tpri;
return (0);
/*
* The thread is "swapped" or is on the swap queue and
* hence no longer on the run queue, so return true.
*/
return (1);
/*
* Search for thread in queue.
*/
}
panic("dispdeq: thread not on queue");
}
/*
* Found it so remove it from queue.
*/
dp->disp_nrunnable--;
if (--dq->dq_sruncnt == 0) {
if (dp->disp_nrunnable == 0) {
int ipri;
}
}
return (1);
}
/*
* dq_sruninc and dq_srundec are public functions for
* incrementing/decrementing the sruncnts when a thread on
* a dispatcher queue is made schedulable/unschedulable by
* resetting the TS_LOAD flag.
*
* The caller MUST hold the thread lock, and therefore the dispatcher
* queue lock, so that the operation which changes
* the flag, the operation that checks whether the thread is on a
* dispatch queue, and the call to this function
* are one atomic operation with respect to interrupts.
*/
/*
* Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
*/
void
dq_sruninc(kthread_t *t)
{
setfrontdq(t);
}
/*
* See comment on calling conventions above.
* Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
*/
void
dq_srundec(kthread_t *t)
{
(void) dispdeq(t);
disp_swapped_enq(t);
}
/*
* Change the dispatcher lock of thread to the "swapped_lock"
* and return with thread lock still held.
*
* Called with thread_lock held, in transition state, and at high spl.
*/
void
{
case TS_RUN:
break;
case TS_ONPROC:
break;
default:
}
}
/*
* This routine is called by setbackdq/setfrontdq if the thread is
* not loaded or loaded and on the swap queue.
*
* Thread state TS_SLEEP implies that a swapped thread
* has been woken up and needs to be swapped in by the swapper.
*
* Thread state TS_RUN implies that the priority of a swapped
* thread is being increased by its scheduling class (e.g. ts_update).
*/
static void
{
case TS_SLEEP:
/*
* Wakeup sched immediately (i.e., next tick) if the
* thread priority is above maxclsyspri.
*/
wake_sched = 1;
else
wake_sched_sec = 1;
break;
case TS_RUN: /* called from ts_update */
break;
default:
}
}
/*
* Make a thread give up its processor. Find the processor on
* which this thread is executing, and have that processor
* preempt.
*/
void
{
int max_pri;
int max_run_pri;
return;
if (max_pri < max_run_pri)
}
/*
* Propagate cpu_runrun and cpu_kprunrun to global visibility.
*/
membar_enter();
/*
* Make the target thread take an excursion through trap()
* to do preempt() (unless we're already in trap or post_syscall,
* calling cpu_surrender via CL_TRAPRET).
*/
}
}
/*
* Commit to and ratify a scheduling decision
*/
/*ARGSUSED*/
static kthread_t *
{
/*
* Commit to, then ratify scheduling decision
*/
if (cpup->cpu_runrun != 0)
cpup->cpu_runrun = 0;
if (cpup->cpu_kprunrun != 0)
cpup->cpu_kprunrun = 0;
membar_enter();
/*
* should have done better
* put this one back and indicate to try again
*/
setfrontdq(tp);
}
return (tp);
}
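/*
* Sketch of the ratify step above (the body of disp_ratify() is partly
* elided in this excerpt): after clearing cpu_runrun/cpu_kprunrun and
* issuing the memory barrier, the queue maxima are re-checked; if a higher
* priority thread appeared in the meantime, tp is requeued with
* setfrontdq() and NULL is returned so the caller rescans:
*
*	tpri = DISP_PRIO(tp);
*	maxpri = MAX(cpup->cpu_disp->disp_maxrunpri, kpq->disp_maxrunpri);
*	if (tpri < maxpri) {
*		setfrontdq(tp);
*		return (NULL);
*	}
*	return (tp);
*/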
/*
* See if there is any work on the dispatcher queue for other CPUs.
* If there is, dequeue the best thread and return.
*/
static kthread_t *
{
maxpri = -1;
while (kpq->disp_maxrunpri >= 0) {
/*
* Try to take a thread from the kp_queue.
*/
if (tp)
}
kpreempt_disable(); /* protect the cpu_active list */
/*
* Try to find something to do on another CPU's run queue.
* Loop through all other CPUs looking for the one with the highest
* priority unbound thread.
*
* On NUMA machines, the partition's CPUs are consulted in order of
* distance from the current CPU. This way, the first available
* work found is also the closest, and will suffer the least
* from being migrated.
*/
/*
* This loop traverses the lpl hierarchy. Higher level lpls represent
* broader levels of locality
*/
do {
/* This loop iterates over the lpl's leaves */
do {
else
/* This loop iterates over the CPUs in the leaf */
do {
/*
* End our stroll around this lpl if:
*
* - Something became runnable on the local
* queue...which also ends our stroll around
* the partition.
*
* - We happen across another idle CPU.
* Since it is patrolling the next portion
* of the lpl's list (assuming it's not
* halted, or busy servicing an interrupt),
* move to the next higher level of locality.
*/
return (NULL);
}
if (ocp->cpu_disp_flags &
ocp->cpu_intr_actv != 0)
continue;
else
goto next_level;
}
/*
* If there's only one thread and the CPU
* is in the middle of a context switch,
* or it's currently running the idle thread,
* don't steal it.
*/
if ((ocp->cpu_disp_flags &
continue;
/*
* Don't steal threads that we attempted
* to steal recently until they're ready
* to be stolen again.
*/
if (stealtime == 0 ||
} else {
/*
* Don't update tcp, just set
* the retval to T_DONTSTEAL, so
* that if no acceptable CPUs
* are found the return value
* will be T_DONTSTEAL rather
* than NULL.
*/
}
}
/*
* Iterate to the next leaf lpl in the resource set
* at this level of locality. If we hit the end of
* the set, wrap back around to the beginning.
*
* Note: This iteration is NULL terminated for a reason
* see lpl_topo_bootstrap() in lgrp.c for details.
*/
leafidx = 0;
}
/*
* Expand the search to include farther away CPUs (next
* locality level). The closer CPUs that have already been
* checked will be checked again. In doing so, idle CPUs
* will tend to be more aggressive about stealing from CPUs
* that are closer (since the closer CPUs will be considered
* more often).
* Begin at this level with the CPU's local leaf lpl.
*/
}
/*
* If another queue looks good, and there is still nothing on
* the local queue, try to transfer one or more threads
* from it to our queue.
*/
return (tp);
}
return (retval);
}
/*
* disp_fix_unbound_pri()
* Determines the maximum priority of unbound threads on the queue.
* The priority is kept for the queue, but is only increased, never
* reduced unless some CPU is looking for something on that queue.
*
* The priority argument is the known upper limit.
*
* Perhaps this should be kept accurately, but that probably means
* separate bitmaps for bound and unbound threads. Since only idled
* CPUs will have to do this recalculation, it seems better this way.
*/
static void
{
int wx;
/*
* Start the search at the next lowest priority below the supplied
* priority. This depends on the bitmap implementation.
*/
do {
/*
* Form mask for all lower priorities in the word.
*/
/*
* Get next lower active priority.
*/
if (mapword != 0) {
} else if (wx > 0) {
if (pri < 0)
break;
} else {
pri = -1;
break;
}
/*
* Search the queue for unbound, runnable threads.
*/
}
/*
* If a thread was found, set the priority and return.
*/
/*
* pri holds the maximum unbound thread priority or -1.
*/
}
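/*
* Standalone sketch of the downward bitmap scan used above: find the
* highest set bit strictly below frompri in a dqactmap-style bitmap of
* ulong_t words. The helper name and the absence of the kernel bitmap
* macros are illustrative choices, not the original code:
*
*	int
*	dqactmap_pri_below(ulong_t *map, int frompri)
*	{
*		int bpw = sizeof (ulong_t) * NBBY;
*		int wx = frompri / bpw;
*		ulong_t mask = (1UL << (frompri % bpw)) - 1;
*
*		for (;;) {
*			ulong_t word = map[wx] & mask;
*
*			if (word != 0) {
*				int b = bpw - 1;
*				while ((word & (1UL << b)) == 0)
*					b--;
*				return (wx * bpw + b);
*			}
*			if (wx-- == 0)
*				return (-1);
*			mask = (ulong_t)-1;
*		}
*	}
*/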
/*
* disp_adjust_unbound_pri() - thread is becoming unbound, so we should
* check if the CPU to which it was previously bound should have
* its disp_max_unbound_pri increased.
*/
void
{
/*
* Don't do anything if the thread is not bound, or
* currently not runnable or swapped out.
*/
return;
}
/*
* disp_getbest()
* De-queue the highest priority unbound runnable thread.
* Returns with the thread unlocked and onproc but at splhigh (like disp()).
* Returns NULL if nothing found.
* Returns T_DONTSTEAL if the thread was not stealable,
* so that the caller will try again later.
*
* Passed a pointer to a dispatch queue not associated with this CPU, and
* its type.
*/
static kthread_t *
{
/*
* If there is nothing to run, or the CPU is in the middle of a
* context switch of the only thread, return NULL.
*/
if (pri == -1 ||
return (NULL);
}
/*
* Assume that all threads are bound on this queue, and change it
* later when we find out that it is not the case.
*/
/*
* Skip over bound threads which could be here even
* though disp_max_unbound_pri indicated this level.
*/
continue;
/*
* We've got some unbound threads on this queue, so turn
* the allbound flag off now.
*/
/*
* The thread is a candidate for stealing from its run queue. We
* don't want to steal threads that became runnable just a
* moment ago. This improves CPU affinity for threads that get
* preempted for short periods of time and go back on the run
* queue.
*
* We want to let it stay on its run queue if it was only placed
* there recently and it was running on the same CPU before that
* to preserve its cache investment. For the thread to remain on
* its run queue, ALL of the following conditions must be
* satisfied:
*
* - the disp queue should not be the kernel preemption queue
* - delayed idle stealing should not be disabled
* - nosteal_nsec should be non-zero
* - it should run with user priority
* - it should be on the run queue of the CPU where it was
* running before being placed on the run queue
* - it should be the only thread on the run queue (to prevent
* extra scheduling latency for other threads)
* - it should sit on the run queue for less than per-chip
* nosteal interval or global nosteal interval
* - in case of CPUs with shared cache it should sit in a run
* queue of a CPU from a different chip
*
* The checks are arranged so that the ones that are faster are
* placed earlier.
*/
pri >= minclsyspri ||
break;
/*
* Steal immediately if, due to the CMT processor architecture,
* migration between cp and tcp would incur no performance
* penalty.
*/
break;
if (nosteal == 0)
break;
/*
* Calculate time spent sitting on run queue
*/
now = gethrtime_unscaled();
/*
* Steal immediately if the time spent on this run queue is more
* than allowed nosteal delay.
*
* Negative rqtime check is needed here to avoid infinite
* stealing delays caused by unlikely but not impossible
* drifts between CPU times on different CPUs.
*/
break;
scalehrtime(&now);
/*
* Calculate when this thread becomes stealable
*/
/*
* Calculate time when some thread becomes stealable
*/
}
/*
* If there were no unbound threads on this queue, find the queue
* where they are and then return later. The value of
* disp_max_unbound_pri is not always accurate because it isn't
* reduced until another idle CPU looks for work.
*/
if (allbound)
/*
* If we reached the end of the queue and found no unbound threads
* then return NULL so that other CPUs will be considered. If there
* are unbound threads but they cannot yet be stolen, then
* return T_DONTSTEAL and try again later.
*/
}
/*
* Found a runnable, unbound thread, so remove it from queue.
* dispdeq() requires that we have the thread locked, and we do,
* by virtue of holding the dispatch queue lock. dispdeq() will
* put the thread in transition state, thereby dropping the dispq
* lock.
*/
#ifdef DEBUG
{
int thread_was_on_queue;
}
#else /* DEBUG */
#endif /* DEBUG */
/*
* Reset the disp_queue steal time - we do not know what the smallest
* value across the queue is.
*/
dp->disp_steal = 0;
/*
* Setup thread to run on the current CPU.
*/
/*
* There can be a memory synchronization race between disp_getbest()
* and disp_ratify() vs cpu_resched() where cpu_resched() is trying
* to preempt the current thread to run the enqueued thread while
* disp_getbest() and disp_ratify() are changing the current thread
* to the stolen thread. This may lead to a situation where
* cpu_resched() tries to preempt the wrong thread and the
* stolen thread continues to run on the CPU which has been tagged
* for preemption.
* Later the clock thread gets enqueued but doesn't get to run on the
* CPU causing the system to hang.
*
* To avoid this, grabbing and dropping the disp_lock (which does
* a memory barrier) is needed to synchronize the execution of
* cpu_resched() with disp_getbest() and disp_ratify() and
* synchronize the memory read and written by cpu_resched(),
* disp_getbest(), and disp_ratify() with each other.
* (see CR#6482861 for more details).
*/
/*
* Return with spl high so that swtch() won't need to raise it.
* The disp_lock was dropped by dispdeq().
*/
return (tp);
}
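/*
* Sketch of the stealability-time computation disp_getbest() performs (the
* original statements are partly elided above; field and function names
* follow the surrounding code): the time the candidate has spent on the
* run queue is compared against the nosteal interval, and if it has not
* yet sat long enough, the queue's disp_steal is advanced to the earliest
* moment at which something becomes stealable:
*
*	now = gethrtime_unscaled();
*	rqtime = now - tp->t_waitrq;
*	scalehrtime(&rqtime);
*	if (rqtime > nosteal || rqtime < 0)
*		break;			(sat long enough: steal it now)
*	scalehrtime(&now);
*	now += (nosteal - rqtime);	(when this thread becomes stealable)
*	if (dp->disp_steal == 0 || dp->disp_steal > now)
*		dp->disp_steal = now;
*/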
/*
* disp_bound_common() - common routine for higher level functions
* that check for bound threads under certain conditions.
* If 'threadlistsafe' is set then there is no need to acquire
* pidlock to stop the thread list from changing (eg, if
* disp_bound_* is called with cpus paused).
*/
static int
{
int found = 0;
if (!threadlistsafe)
do {
/*
* If an interrupt thread is busy, but the
* caller doesn't care (i.e. BOUND_INTR is off),
* then just ignore it and continue through.
*/
!(flag & BOUND_INTR))
continue;
/*
* Skip the idle thread for the CPU
* we're about to set offline.
*/
continue;
/*
* Skip the pause thread for the CPU
* we're about to set offline.
*/
continue;
found = 1;
break;
}
if ((flag & BOUND_PARTITION) &&
found = 1;
break;
}
}
if (!threadlistsafe)
return (found);
}
/*
* disp_bound_threads - return nonzero if threads are bound to the processor.
* Called infrequently. Keep this simple.
* Includes threads that are asleep or stopped but not onproc.
*/
int
{
}
/*
* disp_bound_anythreads - return nonzero if _any_ threads are bound
* to the given processor, including interrupt threads.
*/
int
{
}
/*
* disp_bound_partition - return nonzero if threads are bound to the same
* partition as the processor.
* Called infrequently. Keep this simple.
* Includes threads that are asleep or stopped but not onproc.
*/
int
{
}
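/*
* The three wrappers above presumably reduce to calls such as the following
* (a sketch, not verbatim):
*
*	disp_bound_threads:	return (disp_bound_common(cp,
*				    threadlistsafe, BOUND_CPU));
*	disp_bound_anythreads:	return (disp_bound_common(cp,
*				    threadlistsafe, BOUND_CPU | BOUND_INTR));
*	disp_bound_partition:	return (disp_bound_common(cp,
*				    threadlistsafe, BOUND_PARTITION));
*/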
/*
* disp_cpu_inactive - make a CPU inactive by moving all of its unbound
* threads to other CPUs.
*/
void
{
int wasonq;
/*
* Skip over bound threads.
*/
}
/* disp_max_unbound_pri must be inaccurate, so fix it */
continue;
}
/*
* Called from cpu_offline:
*
* cp has already been removed from the list of active cpus
* and tp->t_cpu has been changed so there is no risk of
* tp ending up back on cp.
*
* Called from cpupart_move_cpu:
*
* The cpu has moved to a new cpupart. Any threads that
* were on its dispatch queues before the move remain
* in the old partition and can't run in the new partition.
*/
}
}
/*
* disp_lowpri_cpu - find CPU running the lowest priority thread.
* The hint passed in is used as a starting point so we don't favor
* CPU 0 or any other CPU. The caller should pass in the most recently
* used CPU for the thread.
*
* The lgroup and priority are used to determine the best CPU to run on
* in a NUMA machine. The lgroup specifies which CPUs are closest while
* the thread priority will indicate whether the thread will actually run
* there. To pick the best CPU, the CPUs inside and outside of the given
* lgroup which are running the lowest priority threads are found. The
* remote CPU is chosen only if the thread will not run locally on a CPU
* within the lgroup, but will run on the remote CPU. If the thread
* cannot immediately run on any CPU, the best local CPU will be chosen.
*
* The lpl specified also identifies the cpu partition from which
* disp_lowpri_cpu should select a CPU.
*
* curcpu is used to indicate that disp_lowpri_cpu is being called on
* behalf of the current thread. (curthread is looking for a new cpu)
* In this case, cpu_dispatch_pri for this thread's cpu should be
* ignored.
*
* If a cpu is the target of an offline request then try to avoid it.
*
* This function must be called at either high SPL, or with preemption
* disabled, so that the "hint" CPU cannot be removed from the online
* CPU list while we are traversing it.
*/
cpu_t *
{
int i;
/*
* Scan for a CPU currently running the lowest priority thread.
* Cannot get cpu_lock here because it is adaptive.
* We do not require lock on CPU list.
*/
/*
* First examine local CPUs.  Note that it's possible the hint CPU
* passed in is remote to the specified home lgroup.  If our priority
* isn't sufficient for us to run immediately at home,
* then examine CPUs remote to our home lgroup.
* We would like to give preference to CPUs closest to "home".
* If we can't find a CPU where we'll run at a given level
* of locality, we expand our search to include the next level.
*/
/* start with lpl we were passed */
do {
continue;
else
do {
cpupri = -1;
else if (cp == cpu_inmotion)
else
if (CPU_IDLING(cpupri)) {
CPU_QUIESCED) == 0);
return (cp);
}
}
}
return (bestcpu);
}
if (besthomecpu == NULL)
/*
* Add the lgrps we just considered to the "done" set
*/
/*
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
return (besthomecpu);
}
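/*
* Minimal standalone sketch of the "scan from a hint, remember the CPU
* running the lowest priority work" idea described above (illustrative
* only; it ignores the lgroup expansion and the cpu_inmotion and offline
* special cases handled by the real routine):
*
*	bestcpu = cp = hint;
*	bestpri = cp->cpu_dispatch_pri;
*	do {
*		if ((cp->cpu_flags & CPU_QUIESCED) == 0 &&
*		    cp->cpu_dispatch_pri < bestpri) {
*			bestcpu = cp;
*			bestpri = cp->cpu_dispatch_pri;
*		}
*	} while ((cp = cp->cpu_next_onln) != hint);
*	return (bestcpu);
*/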
/*
* This routine provides the generic idle cpu function for all processors.
* If a processor has some specific code to execute when idle (say, to stop
* the pipeline and save power) then that routine should be defined in the
* processor's specific code (module_xx.c) and the global variable idle_cpu
* set to that function.
*/
static void
generic_idle_cpu(void)
{
}
/*ARGSUSED*/
static void
{
}