/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.30 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <vm/as.h>
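/*
 * Binding categories passed as a flag mask to disp_bound_common() by the
 * disp_bound_threads(), disp_bound_anythreads() and disp_bound_partition()
 * wrappers below.
 */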
#define BOUND_CPU 0x1
#define BOUND_PARTITION 0x2
#define BOUND_INTR 0x4
/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
disp_t *dp;
dispq_t *olddispq;
dispq_t *newdispq;
ulong_t *olddqactmap;
ulong_t *newdqactmap;
int oldnglobpris;
};
static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
disp_t *dp);
static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void disp_dq_free(struct disp_queue_info *dptr);
/* platform-specific routine to call when processor is idle */
static void generic_idle_cpu();
void (*idle_cpu)() = generic_idle_cpu;
/* routines invoked when a CPU enters/exits the idle loop */
static void idle_enter();
static void idle_exit();
/* platform-specific routine to call when thread is enqueued */
static void generic_enq_thread(cpu_t *, int);
void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
pri_t kpreemptpri; /* priority where kernel preemption applies */
pri_t upreemptpri = 0; /* priority where normal preemption applies */
pri_t intr_pri; /* interrupt thread priority base level */
#define KPQPRI -1 /* priority where cpu affinity is dropped for kp queue */
pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
disp_t cpu0_disp; /* boot CPU's dispatch queue */
disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
int nswapped; /* total number of swapped threads */
void disp_swapped_enq(kthread_t *tp);
static void disp_swapped_setrun(kthread_t *tp);
static void cpu_resched(cpu_t *cp, pri_t tpri);
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
* will either be the max sysclass pri + 1 or the min interrupt pri.
*/
int only_intr_kpreempt;
extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define SETKP_BACK 0
#define SETKP_FRONT 1
/*
* Parameter that determines how recently a thread must have run
* on the CPU to be considered loosely-bound to that CPU to reduce
* cold cache effects. The interval is measured in clock ticks.
*
* The platform may define a per-physical-processor adjustment of
* this parameter. For efficiency, the effective rechoose interval
* (rechoose_interval + per-chip adjustment) is maintained in the
* cpu structures. See cpu_choose().
*/
int rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t *cpu_choose(kthread_t *, pri_t);
id_t defaultcid; /* system "default" class; see dispadmin(1M) */
disp_lock_t transition_lock; /* lock on transitioning threads */
disp_lock_t stop_lock; /* lock on stopped threads */
disp_lock_t shuttle_lock; /* lock on shuttle objects */
static void cpu_dispqalloc(int numpris);
static kthread_t *disp_getwork(cpu_t *to);
static kthread_t *disp_getbest(disp_t *from);
static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
void swtch_to(kthread_t *);
/*
* dispatcher and scheduler initialization
*/
/*
* disp_setup - Common code to calculate and allocate dispatcher
* variables and structures based on the maximum priority.
*/
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
pri_t newnglobpris;
ASSERT(MUTEX_HELD(&cpu_lock));
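/*
 * The LOCK_LEVEL extra priority slots above maxglobpri hold interrupt
 * thread priorities, which sit above every scheduling-class priority
 * (intr_pri, set below, is the interrupt thread base level).
 */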
newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
if (newnglobpris > oldnglobpris) {
/*
* Allocate new kp queues for each CPU partition.
*/
cpupart_kpqalloc(newnglobpris);
/*
* Allocate new dispatch queues for each CPU.
*/
cpu_dispqalloc(newnglobpris);
/*
* compute new interrupt thread base priority
*/
intr_pri = maxglobpri;
if (only_intr_kpreempt) {
kpreemptpri = intr_pri + 1;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
}
v.v_nglobpris = newnglobpris;
}
}
/*
* dispinit - Called to initialize all loaded classes and the
* dispatcher framework.
*/
void
dispinit(void)
{
id_t cid;
pri_t maxglobpri;
pri_t cl_maxglobpri;
maxglobpri = -1;
/*
* Initialize transition lock, which will always be set.
*/
DISP_LOCK_INIT(&transition_lock);
disp_lock_enter_high(&transition_lock);
DISP_LOCK_INIT(&stop_lock);
DISP_LOCK_INIT(&shuttle_lock);
mutex_enter(&cpu_lock);
CPU->cpu_disp->disp_maxrunpri = -1;
CPU->cpu_disp->disp_max_unbound_pri = -1;
/*
* Initialize the default CPU partition.
*/
cpupart_initialize_default();
/*
* Call the class specific initialization functions for
* all pre-installed schedulers.
*
* We pass the size of a class specific parameter
* buffer to each of the initialization functions
* to try to catch problems with backward compatibility
* of class modules.
*
* For example a new class module running on an old system
* which didn't provide sufficiently large parameter buffers
* would be bad news. Class initialization modules can check for
* this and take action if they detect a problem.
*/
for (cid = 0; cid < nclass; cid++) {
sclass_t *sc;
sc = &sclass[cid];
if (SCHED_INSTALLED(sc)) {
cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
&sc->cl_funcs);
if (cl_maxglobpri > maxglobpri)
maxglobpri = cl_maxglobpri;
}
}
kpreemptpri = (pri_t)v.v_maxsyspri + 1;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
ASSERT(maxglobpri >= 0);
disp_setup(maxglobpri, 0);
mutex_exit(&cpu_lock);
/*
* Get the default class ID; this may be later modified via
* dispadmin(1M). This will load the class (normally TS) and that will
* call disp_add(), which is why we had to drop cpu_lock first.
*/
if (getcid(defaultclass, &defaultcid) != 0) {
cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
defaultclass);
}
}
/*
* disp_add - Called with class pointer to initialize the dispatcher
* for a newly loaded class.
*/
void
disp_add(sclass_t *clp)
{
pri_t maxglobpri;
pri_t cl_maxglobpri;
mutex_enter(&cpu_lock);
/*
* Initialize the scheduler class.
*/
maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
if (cl_maxglobpri > maxglobpri)
maxglobpri = cl_maxglobpri;
/*
* Save old queue information. Since we're initializing a
* new scheduling class which has just been loaded, the
* size of the dispq may have changed. We need to handle
* that here.
*/
disp_setup(maxglobpri, v.v_nglobpris);
mutex_exit(&cpu_lock);
}
/*
* For each CPU, allocate new dispatch queues
* with the stated number of priorities.
*/
static void
cpu_dispqalloc(int numpris)
{
cpu_t *cpup;
struct disp_queue_info *disp_mem;
int i, num;
ASSERT(MUTEX_HELD(&cpu_lock));
disp_mem = kmem_zalloc(NCPU *
sizeof (struct disp_queue_info), KM_SLEEP);
/*
* This routine must allocate all of the memory before stopping
* the CPUs because it must not sleep in kmem_alloc while the
* CPUs are stopped. Locks they hold will not be released until
* they are restarted.
*/
i = 0;
cpup = cpu_list;
do {
disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
i++;
cpup = cpup->cpu_next;
} while (cpup != cpu_list);
num = i;
pause_cpus(NULL);
for (i = 0; i < num; i++)
disp_dq_assign(&disp_mem[i], numpris);
start_cpus();
/*
* We must free all of the memory after starting the CPUs because
* we cannot risk sleeping in kmem_free while the CPUs are stopped.
*/
for (i = 0; i < num; i++)
disp_dq_free(&disp_mem[i]);
kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}
static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
sizeof (long), KM_SLEEP);
dptr->dp = dp;
}
static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
disp_t *dp;
dp = dptr->dp;
dptr->olddispq = dp->disp_q;
dptr->olddqactmap = dp->disp_qactmap;
dptr->oldnglobpris = dp->disp_npri;
ASSERT(dptr->oldnglobpris < numpris);
if (dptr->olddispq != NULL) {
/*
* Use kcopy because bcopy is platform-specific
* and could block while we might have paused the cpus.
*/
(void) kcopy(dptr->olddispq, dptr->newdispq,
dptr->oldnglobpris * sizeof (dispq_t));
(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
((dptr->oldnglobpris / BT_NBIPUL) + 1) *
sizeof (long));
}
dp->disp_q = dptr->newdispq;
dp->disp_qactmap = dptr->newdqactmap;
dp->disp_q_limit = &dptr->newdispq[numpris];
dp->disp_npri = numpris;
}
static void
disp_dq_free(struct disp_queue_info *dptr)
{
if (dptr->olddispq != NULL)
kmem_free(dptr->olddispq,
dptr->oldnglobpris * sizeof (dispq_t));
if (dptr->olddqactmap != NULL)
kmem_free(dptr->olddqactmap,
((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}
/*
* For a newly created CPU, initialize the dispatch queue.
* This is called before the CPU is known through cpu[] or on any lists.
*/
void
disp_cpu_init(cpu_t *cp)
{
disp_t *dp;
dispq_t *newdispq;
ulong_t *newdqactmap;
ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
if (cp == cpu0_disp.disp_cpu)
dp = &cpu0_disp;
else
dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
bzero(dp, sizeof (disp_t));
cp->cpu_disp = dp;
dp->disp_cpu = cp;
dp->disp_maxrunpri = -1;
dp->disp_max_unbound_pri = -1;
DISP_LOCK_INIT(&cp->cpu_thread_lock);
/*
* Allocate memory for the dispatcher queue headers
* and the active queue bitmap.
*/
newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
sizeof (long), KM_SLEEP);
dp->disp_q = newdispq;
dp->disp_qactmap = newdqactmap;
dp->disp_q_limit = &newdispq[v.v_nglobpris];
dp->disp_npri = v.v_nglobpris;
}
void
disp_cpu_fini(cpu_t *cp)
{
ASSERT(MUTEX_HELD(&cpu_lock));
disp_kp_free(cp->cpu_disp);
if (cp->cpu_disp != &cpu0_disp)
kmem_free(cp->cpu_disp, sizeof (disp_t));
}
/*
* Allocate new, larger kpreempt dispatch queue to replace the old one.
*/
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
struct disp_queue_info mem_info;
if (npri > dq->disp_npri) {
/*
* Allocate memory for the new array.
*/
disp_dq_alloc(&mem_info, npri, dq);
/*
* We need to copy the old structures to the new
* and free the old.
*/
disp_dq_assign(&mem_info, npri);
disp_dq_free(&mem_info);
}
}
/*
* Free dispatch queue.
* Used for the kpreempt queues for a removed CPU partition and
* for the per-CPU queues of deleted CPUs.
*/
void
disp_kp_free(disp_t *dq)
{
struct disp_queue_info mem_info;
mem_info.olddispq = dq->disp_q;
mem_info.olddqactmap = dq->disp_qactmap;
mem_info.oldnglobpris = dq->disp_npri;
disp_dq_free(&mem_info);
}
/*
* End dispatcher and scheduler initialization.
*/
/*
* See if there's anything to do other than remain idle.
* Return non-zero if there is.
*
* This function must be called with high spl, or with
* kernel preemption disabled to prevent the partition's
* active cpu list from changing while being traversed.
*
*/
int
disp_anywork(void)
{
cpu_t *cp = CPU;
cpu_t *ocp;
if (cp->cpu_disp->disp_nrunnable != 0)
return (1);
if (!(cp->cpu_flags & CPU_OFFLINE)) {
if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
return (1);
/*
* Work can be taken from another CPU if:
* - There is unbound work on the run queue
* - That work isn't a thread undergoing a
*   context switch on an otherwise empty queue.
* - The CPU isn't running the idle loop.
*/
for (ocp = cp->cpu_next_part; ocp != cp;
ocp = ocp->cpu_next_part) {
ASSERT(CPU_ACTIVE(ocp));
if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
!((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
ocp->cpu_disp->disp_nrunnable == 1) &&
ocp->cpu_dispatch_pri != -1)
return (1);
}
}
return (0);
}
/*
* Called when CPU enters the idle loop
*/
static void
idle_enter()
{
cpu_t *cp = CPU;
new_cpu_mstate(cp, CMS_IDLE);
CPU_STATS_ADDQ(cp, sys, idlethread, 1);
set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
}
/*
* Called when CPU exits the idle loop
*/
static void
idle_exit()
{
cpu_t *cp = CPU;
new_cpu_mstate(cp, CMS_SYSTEM);
unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
}
/*
* Idle loop.
*/
void
idle()
{
struct cpu *cp = CPU; /* pointer to this CPU */
kthread_t *t; /* taken thread */
idle_enter();
/*
* Uniprocessor version of idle loop.
* Do this until notified that we're on an actual multiprocessor.
*/
while (ncpus == 1) {
if (cp->cpu_disp->disp_nrunnable == 0) {
(*idle_cpu)();
continue;
}
idle_exit();
swtch();
idle_enter(); /* returned from swtch */
}
/*
* Multiprocessor idle loop.
*/
for (;;) {
/*
* If CPU is completely quiesced by p_online(2), just wait
* here with minimal bus traffic until put online.
*/
while (cp->cpu_flags & CPU_QUIESCED)
(*idle_cpu)();
if (cp->cpu_disp->disp_nrunnable != 0) {
idle_exit();
swtch();
} else {
if (cp->cpu_flags & CPU_OFFLINE)
continue;
if ((t = disp_getwork(cp)) == NULL) {
if (cp->cpu_chosen_level != -1) {
disp_t *dp = cp->cpu_disp;
disp_t *kpq;
disp_lock_enter(&dp->disp_lock);
/*
* Set kpq under lock to prevent
* migration between partitions.
*/
kpq = &cp->cpu_part->cp_kp_queue;
if (kpq->disp_maxrunpri == -1)
cp->cpu_chosen_level = -1;
disp_lock_exit(&dp->disp_lock);
}
(*idle_cpu)();
continue;
}
idle_exit();
restore_mstate(t);
swtch_to(t);
}
idle_enter(); /* returned from swtch/swtch_to */
}
}
/*
* Preempt the currently running thread in favor of the highest
* priority thread. The class of the current thread controls
* where it goes on the dispatcher queues. If panicking, turn
* preemption off.
*/
void
preempt()
{
kthread_t *t = curthread;
klwp_t *lwp = ttolwp(curthread);
if (panicstr)
return;
TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
thread_lock(t);
if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
/*
* this thread has already been chosen to be run on
* another CPU. Clear kprunrun on this CPU since we're
* already headed for swtch().
*/
CPU->cpu_kprunrun = 0;
thread_unlock_nopreempt(t);
TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
} else {
if (lwp != NULL)
lwp->lwp_ru.nivcsw++;
CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
THREAD_TRANSITION(t);
CL_PREEMPT(t);
DTRACE_SCHED(preempt);
thread_unlock_nopreempt(t);
TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
swtch(); /* clears CPU->cpu_runrun via disp() */
}
}
extern kthread_t *thread_unpin();
/*
* disp() - find the highest priority thread for this processor to run, and
* set it in TS_ONPROC state so that resume() can be called to run it.
*/
static kthread_t *
disp()
{
cpu_t *cpup;
disp_t *dp;
kthread_t *tp;
dispq_t *dq;
int maxrunword;
pri_t pri;
disp_t *kpq;
TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
cpup = CPU;
/*
* Find the highest priority loaded, runnable thread.
*/
dp = cpup->cpu_disp;
reschedule:
/*
* If there is more important work on the global queue with a better
* priority than the maximum on this CPU, take it now.
*/
kpq = &cpup->cpu_part->cp_kp_queue;
while ((pri = kpq->disp_maxrunpri) >= 0 &&
pri >= dp->disp_maxrunpri &&
(cpup->cpu_flags & CPU_OFFLINE) == 0 &&
(tp = disp_getbest(kpq)) != NULL) {
if (disp_ratify(tp, kpq) != NULL) {
TRACE_1(TR_FAC_DISP, TR_DISP_END,
"disp_end:tid %p", tp);
restore_mstate(tp);
return (tp);
}
}
disp_lock_enter(&dp->disp_lock);
pri = dp->disp_maxrunpri;
/*
* If there is nothing to run, look at what's runnable on other queues.
* Choose the idle thread if the CPU is quiesced.
* Note that CPUs that have the CPU_OFFLINE flag set can still run
* interrupt threads, which will be the only threads on the CPU's own
* queue, but cannot run threads from other queues.
*/
if (pri == -1) {
if (!(cpup->cpu_flags & CPU_OFFLINE)) {
disp_lock_exit(&dp->disp_lock);
if ((tp = disp_getwork(cpup)) == NULL) {
tp = cpup->cpu_idle_thread;
(void) splhigh();
THREAD_ONPROC(tp, cpup);
cpup->cpu_dispthread = tp;
cpup->cpu_dispatch_pri = -1;
cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
cpup->cpu_chosen_level = -1;
}
} else {
disp_lock_exit_high(&dp->disp_lock);
tp = cpup->cpu_idle_thread;
THREAD_ONPROC(tp, cpup);
cpup->cpu_dispthread = tp;
cpup->cpu_dispatch_pri = -1;
cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
cpup->cpu_chosen_level = -1;
}
TRACE_1(TR_FAC_DISP, TR_DISP_END,
"disp_end:tid %p", tp);
restore_mstate(tp);
return (tp);
}
dq = &dp->disp_q[pri];
tp = dq->dq_first;
ASSERT(tp != NULL);
ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
/*
* Found it so remove it from queue.
*/
dp->disp_nrunnable--;
dq->dq_sruncnt--;
if ((dq->dq_first = tp->t_link) == NULL) {
ulong_t *dqactmap = dp->disp_qactmap;
ASSERT(dq->dq_sruncnt == 0);
dq->dq_last = NULL;
/*
* The queue is empty, so the corresponding bit needs to be
* turned off in dqactmap. If nrunnable != 0, we just took the
* last runnable thread off the highest-priority queue, so
* recompute disp_maxrunpri.
*/
maxrunword = pri >> BT_ULSHIFT;
dqactmap[maxrunword] &= ~BT_BIW(pri);
if (dp->disp_nrunnable == 0) {
dp->disp_max_unbound_pri = -1;
dp->disp_maxrunpri = -1;
} else {
int ipri;
ipri = bt_gethighbit(dqactmap, maxrunword);
dp->disp_maxrunpri = ipri;
if (ipri < dp->disp_max_unbound_pri)
dp->disp_max_unbound_pri = ipri;
}
} else {
tp->t_link = NULL;
}
/*
* Set TS_DONT_SWAP flag to prevent another processor from swapping
* out this thread before we have a chance to run it.
* While running, it is protected against swapping by t_lock.
*/
tp->t_schedflag |= TS_DONT_SWAP;
cpup->cpu_dispthread = tp; /* protected by spl only */
cpup->cpu_dispatch_pri = pri;
ASSERT(pri == DISP_PRIO(tp));
thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
ASSERT(tp != NULL);
TRACE_1(TR_FAC_DISP, TR_DISP_END,
"disp_end:tid %p", tp);
if (disp_ratify(tp, kpq) == NULL)
goto reschedule;
restore_mstate(tp);
return (tp);
}
/*
* swtch()
* Find best runnable thread and run it.
* Called with the current thread already switched to a new state,
* on a sleep queue, run queue, stopped, and not zombied.
* May be called at any spl level less than or equal to LOCK_LEVEL.
* Always drops spl to the base level (spl0()).
*/
void
swtch()
{
kthread_t *t = curthread;
kthread_t *next;
cpu_t *cp;
TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
if (t->t_flag & T_INTR_THREAD)
cpu_intr_swtch_enter(t);
if (t->t_intr != NULL) {
/*
* We are an interrupt thread. Setup and return
* the interrupted thread to be resumed.
*/
(void) splhigh(); /* block other scheduler action */
cp = CPU; /* now protected against migration */
ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
CPU_STATS_ADDQ(cp, sys, pswitch, 1);
CPU_STATS_ADDQ(cp, sys, intrblk, 1);
next = thread_unpin();
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
resume_from_intr(next);
} else {
#ifdef DEBUG
if (t->t_state == TS_ONPROC &&
t->t_disp_queue->disp_cpu == CPU &&
t->t_preempt == 0) {
thread_lock(t);
ASSERT(t->t_state != TS_ONPROC ||
t->t_disp_queue->disp_cpu != CPU ||
t->t_preempt != 0); /* cannot migrate */
thread_unlock_nopreempt(t);
}
#endif /* DEBUG */
cp = CPU;
next = disp(); /* returns with spl high */
ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
/* OK to steal anything left on run queue */
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
if (next != t) {
if (t == cp->cpu_idle_thread) {
CHIP_NRUNNING(cp->cpu_chip, 1);
} else if (next == cp->cpu_idle_thread) {
CHIP_NRUNNING(cp->cpu_chip, -1);
}
CPU_STATS_ADDQ(cp, sys, pswitch, 1);
cp->cpu_last_swtch = t->t_disp_time = lbolt;
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
if (dtrace_vtime_active)
dtrace_vtime_switch(next);
resume(next);
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we may not
* return here
*/
} else {
if (t->t_flag & T_INTR_THREAD)
cpu_intr_swtch_exit(t);
DTRACE_SCHED(remain__cpu);
TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
(void) spl0();
}
}
}
/*
* swtch_from_zombie()
* Special case of swtch(), which allows checks for TS_ZOMB to be
* eliminated from normal resume.
* Find best runnable thread and run it.
* Called with the current thread zombied.
* Zombies cannot migrate, so CPU references are safe.
*/
void
swtch_from_zombie()
{
kthread_t *next;
cpu_t *cpu = CPU;
TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
ASSERT(curthread->t_state == TS_ZOMB);
next = disp(); /* returns with spl high */
ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
ASSERT(next != curthread);
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
if (next == cpu->cpu_idle_thread)
CHIP_NRUNNING(cpu->cpu_chip, -1);
if (dtrace_vtime_active)
dtrace_vtime_switch(next);
resume_from_zombie(next);
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we certainly will not
* return here
*/
}
#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
cpu_t *cp;
cpu_t *self;
disp_t *dp;
self = CPU;
cp = self->cpu_next_onln;
dp = cp->cpu_disp;
for (;;) {
dispq_t *dq;
dispq_t *eq;
disp_lock_enter_high(&dp->disp_lock);
for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
kthread_t *rp;
ASSERT(dq->dq_last == NULL ||
dq->dq_last->t_link == NULL);
for (rp = dq->dq_first; rp; rp = rp->t_link)
if (tp == rp) {
disp_lock_exit_high(&dp->disp_lock);
return (1);
}
}
disp_lock_exit_high(&dp->disp_lock);
if (cp == NULL)
break;
if (cp == self) {
cp = NULL;
dp = &self->cpu_part->cp_kp_queue;
} else {
cp = cp->cpu_next_onln;
dp = cp->cpu_disp;
}
}
return (0);
} /* end of thread_on_queue */
#else
#define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
#endif /* DEBUG */
/*
* Like swtch(), but switch to a specified thread taken from another CPU.
* Called with spl high.
*/
void
swtch_to(kthread_t *next)
{
cpu_t *cp = CPU;
TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
/*
* Update context switch statistics.
*/
CPU_STATS_ADDQ(cp, sys, pswitch, 1);
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
if (curthread == cp->cpu_idle_thread)
CHIP_NRUNNING(cp->cpu_chip, 1);
/* OK to steal anything left on run queue */
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
/* record last execution time */
cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
if (dtrace_vtime_active)
dtrace_vtime_switch(next);
resume(next);
/*
* The TR_RESUME_END and TR_SWTCH_END trace points
* appear at the end of resume(), because we may not
* return here
*/
}
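/*
 * A cpu_dispatch_pri of -1 means the CPU has selected (or is running)
 * its idle thread; see disp(), which sets the priority to -1 in that case.
 */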
#define CPU_IDLING(pri) ((pri) == -1)
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
int call_poke_cpu = 0;
pri_t cpupri = cp->cpu_dispatch_pri;
if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
"CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
cp->cpu_runrun = 1;
aston(cp->cpu_dispthread);
if (tpri < kpreemptpri && cp != CPU)
call_poke_cpu = 1;
}
if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
cp->cpu_kprunrun = 1;
if (cp != CPU)
call_poke_cpu = 1;
}
}
/*
* Propagate cpu_runrun and cpu_kprunrun to global visibility.
*/
membar_enter();
if (call_poke_cpu)
poke_cpu(cp->cpu_id);
}
/*
* Routine used by setbackdq() to balance load across the physical
* processors. Returns a CPU of a less-loaded chip in the lgroup
* if balancing is necessary, or the "hint" CPU if it's not.
*
* - tp is the thread being enqueued
* - cp is a hint CPU (chosen by cpu_choose()).
* - curchip (if not NULL) is the chip on which the current thread
* is running.
*
* The thread lock for "tp" must be held while calling this routine.
*/
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
int chp_nrun, ochp_nrun;
chip_t *chp, *nchp;
chp = cp->cpu_chip;
chp_nrun = chp->chip_nrunning;
if (chp == curchip)
chp_nrun--; /* Ignore curthread */
/*
* If this chip isn't at all idle, then let
* run queue balancing do the work.
*/
if (chp_nrun == chp->chip_ncpu)
return (cp);
nchp = chp->chip_balance;
do {
if (nchp == chp ||
!CHIP_IN_CPUPART(nchp, tp->t_cpupart))
continue;
ochp_nrun = nchp->chip_nrunning;
/*
* If the other chip is running fewer threads, or if it's
* running the same number of threads but has more online
* logical CPUs, then choose to balance.
*/
if (chp_nrun > ochp_nrun ||
(chp_nrun == ochp_nrun &&
nchp->chip_ncpu > chp->chip_ncpu)) {
cp = nchp->chip_cpus;
nchp->chip_cpus = cp->cpu_next_chip;
/*
* Find a CPU on the chip in the correct
* partition. We know at least one exists
* because of the CHIP_IN_CPUPART() check above.
*/
while (cp->cpu_part != tp->t_cpupart)
cp = cp->cpu_next_chip;
}
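/*
 * Advance the chip_balance hint so the next balancing attempt
 * starts its scan from a different chip in the lgroup.
 */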
chp->chip_balance = nchp->chip_next_lgrp;
break;
} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
return (cp);
}
/*
* setbackdq() keeps runqs balanced such that the difference in length
* between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
* For threads with priorities below RUNQ_MATCH_PRI, the runq lengths must
* match. When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
* try to keep runqs perfectly balanced regardless of the thread priority.
*/
#define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
#define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
#define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
/*
* Put the specified thread on the back of the dispatcher
* queue corresponding to its current priority.
*
* Called with the thread in transition, onproc or stopped state
* and locked (transition implies locked) and at high spl.
* Returns with the thread in TS_RUN state and still locked.
*/
void
setbackdq(kthread_t *tp)
{
dispq_t *dq;
disp_t *dp;
chip_t *curchip = NULL;
cpu_t *cp;
pri_t tpri;
int bound;
ASSERT(THREAD_LOCK_HELD(tp));
ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
if (tp->t_waitrq == 0) {
hrtime_t curtime;
curtime = gethrtime_unscaled();
(void) cpu_update_pct(tp, curtime);
tp->t_waitrq = curtime;
} else {
(void) cpu_update_pct(tp, gethrtime_unscaled());
}
ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
/*
* If thread is "swapped" or on the swap queue don't
* queue it, but wake sched.
*/
if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
disp_swapped_setrun(tp);
return;
}
tpri = DISP_PRIO(tp);
if (tp == curthread) {
curchip = CPU->cpu_chip;
}
if (ncpus == 1)
cp = tp->t_cpu;
else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
if (tpri >= kpqpri) {
setkpdq(tp, SETKP_BACK);
return;
}
/*
* Let cpu_choose suggest a CPU.
*/
cp = cpu_choose(tp, tpri);
if (tp->t_cpupart == cp->cpu_part) {
int qlen;
/*
* Select another CPU if we need
* to do some load balancing across the
* physical processors.
*/
if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
cp = chip_balance(tp, cp, curchip);
/*
* Balance across the run queues
*/
qlen = RUNQ_LEN(cp, tpri);
if (tpri >= RUNQ_MATCH_PRI &&
!(tp->t_schedflag & TS_RUNQMATCH))
qlen -= RUNQ_MAX_DIFF;
if (qlen > 0) {
cpu_t *np;
if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
np = cp->cpu_next_part;
else {
if ((np = cp->cpu_next_lpl) == cp)
np = cp->cpu_next_part;
}
if (RUNQ_LEN(np, tpri) < qlen)
cp = np;
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
tp->t_lpl, tp->t_pri, NULL);
}
bound = 0;
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
/*
* It is possible that t_weakbound_cpu != t_bound_cpu (for
* a short time until weak binding that existed when the
* strong binding was established has dropped) so we must
* favour weak binding over strong.
*/
cp = tp->t_weakbound_cpu ?
tp->t_weakbound_cpu : tp->t_bound_cpu;
bound = 1;
}
dp = cp->cpu_disp;
disp_lock_enter_high(&dp->disp_lock);
DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
tpri, cp, tp);
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */
ASSERT(tpri >= 0 && tpri < dp->disp_npri);
THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
tp->t_disp_queue = dp;
tp->t_link = NULL;
dq = &dp->disp_q[tpri];
dp->disp_nrunnable++;
membar_enter();
if (dq->dq_sruncnt++ != 0) {
ASSERT(dq->dq_first != NULL);
dq->dq_last->t_link = tp;
dq->dq_last = tp;
} else {
ASSERT(dq->dq_first == NULL);
ASSERT(dq->dq_last == NULL);
dq->dq_first = dq->dq_last = tp;
BT_SET(dp->disp_qactmap, tpri);
if (tpri > dp->disp_maxrunpri) {
dp->disp_maxrunpri = tpri;
membar_enter();
cpu_resched(cp, tpri);
}
}
if (!bound && tpri > dp->disp_max_unbound_pri) {
if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
cp == CPU) {
/*
* If there are no other unbound threads on the
* run queue, don't allow other CPUs to steal
* this thread while we are in the middle of a
* context switch. We may just switch to it
* again right away. CPU_DISP_DONTSTEAL is cleared
* in swtch and swtch_to.
*/
cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
}
dp->disp_max_unbound_pri = tpri;
}
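/* Platform hook invoked whenever a thread is placed on a CPU's queue. */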
(*disp_enq_thread)(cp, bound);
}
/*
* Put the specified thread on the front of the dispatcher
* queue corresponding to its current priority.
*
* Called with the thread in transition, onproc or stopped state
* and locked (transition implies locked) and at high spl.
* Returns with the thread in TS_RUN state and still locked.
*/
void
setfrontdq(kthread_t *tp)
{
disp_t *dp;
dispq_t *dq;
cpu_t *cp;
pri_t tpri;
int bound;
ASSERT(THREAD_LOCK_HELD(tp));
ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
if (tp->t_waitrq == 0) {
hrtime_t curtime;
curtime = gethrtime_unscaled();
(void) cpu_update_pct(tp, curtime);
tp->t_waitrq = curtime;
} else {
(void) cpu_update_pct(tp, gethrtime_unscaled());
}
ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
/*
* If thread is "swapped" or on the swap queue don't
* queue it, but wake sched.
*/
if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
disp_swapped_setrun(tp);
return;
}
tpri = DISP_PRIO(tp);
if (ncpus == 1)
cp = tp->t_cpu;
else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
if (tpri >= kpqpri) {
setkpdq(tp, SETKP_FRONT);
return;
}
cp = tp->t_cpu;
if (tp->t_cpupart == cp->cpu_part) {
/*
* If our priority is higher than or equal to that of the
* highest-priority runnable thread on the current CPU,
* just pick this CPU. Otherwise let cpu_choose() select
* the CPU. If this CPU is the target of an offline request,
* do not pick it - a thread_nomigrate() on the in-motion
* CPU relies on this when it forces a preempt.
*/
if (tpri < cp->cpu_disp->disp_maxrunpri ||
cp == cpu_inmotion)
cp = cpu_choose(tp, tpri);
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
tp->t_lpl, tp->t_pri, NULL);
}
bound = 0;
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
/*
* It is possible that t_weakbound_cpu != t_bound_cpu (for
* a short time until weak binding that existed when the
* strong binding was established has dropped) so we must
* favour weak binding over strong.
*/
cp = tp->t_weakbound_cpu ?
tp->t_weakbound_cpu : tp->t_bound_cpu;
bound = 1;
}
dp = cp->cpu_disp;
disp_lock_enter_high(&dp->disp_lock);
TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */
ASSERT(tpri >= 0 && tpri < dp->disp_npri);
THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
tp->t_disp_queue = dp;
dq = &dp->disp_q[tpri];
dp->disp_nrunnable++;
membar_enter();
if (dq->dq_sruncnt++ != 0) {
ASSERT(dq->dq_last != NULL);
tp->t_link = dq->dq_first;
dq->dq_first = tp;
} else {
ASSERT(dq->dq_last == NULL);
ASSERT(dq->dq_first == NULL);
tp->t_link = NULL;
dq->dq_first = dq->dq_last = tp;
BT_SET(dp->disp_qactmap, tpri);
if (tpri > dp->disp_maxrunpri) {
dp->disp_maxrunpri = tpri;
membar_enter();
cpu_resched(cp, tpri);
}
}
if (!bound && tpri > dp->disp_max_unbound_pri) {
if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
cp == CPU) {
/*
* If there are no other unbound threads on the
* run queue, don't allow other CPUs to steal
* this thread while we are in the middle of a
* context switch. We may just switch to it
* again right away. CPU_DISP_DONTSTEAL is cleared
* in swtch and swtch_to.
*/
cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
}
dp->disp_max_unbound_pri = tpri;
}
(*disp_enq_thread)(cp, bound);
}
/*
* Put a high-priority unbound thread on the kp queue
*/
static void
setkpdq(kthread_t *tp, int borf)
{
dispq_t *dq;
disp_t *dp;
cpu_t *cp;
pri_t tpri;
tpri = DISP_PRIO(tp);
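/* Enqueue on the partition-wide kpreempt queue rather than a per-CPU queue. */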
dp = &tp->t_cpupart->cp_kp_queue;
disp_lock_enter_high(&dp->disp_lock);
TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
ASSERT(tpri >= 0 && tpri < dp->disp_npri);
DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
tp->t_disp_queue = dp;
dp->disp_nrunnable++;
dq = &dp->disp_q[tpri];
if (dq->dq_sruncnt++ != 0) {
if (borf == SETKP_BACK) {
ASSERT(dq->dq_first != NULL);
tp->t_link = NULL;
dq->dq_last->t_link = tp;
dq->dq_last = tp;
} else {
ASSERT(dq->dq_last != NULL);
tp->t_link = dq->dq_first;
dq->dq_first = tp;
}
} else {
if (borf == SETKP_BACK) {
ASSERT(dq->dq_first == NULL);
ASSERT(dq->dq_last == NULL);
dq->dq_first = dq->dq_last = tp;
} else {
ASSERT(dq->dq_last == NULL);
ASSERT(dq->dq_first == NULL);
tp->t_link = NULL;
dq->dq_first = dq->dq_last = tp;
}
BT_SET(dp->disp_qactmap, tpri);
if (tpri > dp->disp_max_unbound_pri)
dp->disp_max_unbound_pri = tpri;
if (tpri > dp->disp_maxrunpri) {
dp->disp_maxrunpri = tpri;
membar_enter();
}
}
cp = tp->t_cpu;
if (tp->t_cpupart != cp->cpu_part) {
/* migrate to a cpu in the new partition */
cp = tp->t_cpupart->cp_cpulist;
}
cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
disp_lock_enter_high(&cp->cpu_disp->disp_lock);
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active)
tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */
if (cp->cpu_chosen_level < tpri)
cp->cpu_chosen_level = tpri;
cpu_resched(cp, tpri);
disp_lock_exit_high(&cp->cpu_disp->disp_lock);
(*disp_enq_thread)(cp, 0);
}
/*
* Remove a thread from the dispatcher queue if it is on it.
* It is not an error if it is not found but we return whether
* or not it was found in case the caller wants to check.
*/
int
dispdeq(kthread_t *tp)
{
disp_t *dp;
dispq_t *dq;
kthread_t *rp;
kthread_t *trp;
kthread_t **ptp;
int tpri;
ASSERT(THREAD_LOCK_HELD(tp));
if (tp->t_state != TS_RUN)
return (0);
/*
* The thread is "swapped" or is on the swap queue and
* hence no longer on the run queue, so return true.
*/
if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
return (1);
tpri = DISP_PRIO(tp);
dp = tp->t_disp_queue;
ASSERT(tpri < dp->disp_npri);
dq = &dp->disp_q[tpri];
ptp = &dq->dq_first;
rp = *ptp;
trp = NULL;
ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
/*
* Search for thread in queue.
* Double links would simplify this at the expense of disp/setrun.
*/
while (rp != tp && rp != NULL) {
trp = rp;
ptp = &trp->t_link;
rp = trp->t_link;
}
if (rp == NULL) {
panic("dispdeq: thread not on queue");
}
DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
/*
* Found it so remove it from queue.
*/
if ((*ptp = rp->t_link) == NULL)
dq->dq_last = trp;
dp->disp_nrunnable--;
if (--dq->dq_sruncnt == 0) {
dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
if (dp->disp_nrunnable == 0) {
dp->disp_max_unbound_pri = -1;
dp->disp_maxrunpri = -1;
} else if (tpri == dp->disp_maxrunpri) {
int ipri;
ipri = bt_gethighbit(dp->disp_qactmap,
dp->disp_maxrunpri >> BT_ULSHIFT);
if (ipri < dp->disp_max_unbound_pri)
dp->disp_max_unbound_pri = ipri;
dp->disp_maxrunpri = ipri;
}
}
tp->t_link = NULL;
THREAD_TRANSITION(tp); /* put in intermediate state */
return (1);
}
/*
* dq_sruninc and dq_srundec are public functions for
* incrementing/decrementing the sruncnts when a thread on
* a dispatcher queue is made schedulable/unschedulable by
* resetting the TS_LOAD flag.
*
* The caller MUST have the thread lock and therefore the dispatcher
* queue lock so that the operation which changes
* the flag, the operation that checks the status of the thread to
* determine if it's on a disp queue AND the call to this function
* are one atomic operation with respect to interrupts.
*/
/*
* Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
*/
void
dq_sruninc(kthread_t *t)
{
ASSERT(t->t_state == TS_RUN);
ASSERT(t->t_schedflag & TS_LOAD);
THREAD_TRANSITION(t);
setfrontdq(t);
}
/*
* See comment on calling conventions above.
* Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
*/
void
dq_srundec(kthread_t *t)
{
ASSERT(t->t_schedflag & TS_LOAD);
(void) dispdeq(t);
disp_swapped_enq(t);
}
/*
* Change the dispatcher lock of thread to the "swapped_lock"
* and return with thread lock still held.
*
* Called with thread_lock held, in transition state, and at high spl.
*/
void
disp_swapped_enq(kthread_t *tp)
{
ASSERT(THREAD_LOCK_HELD(tp));
ASSERT(tp->t_schedflag & TS_LOAD);
switch (tp->t_state) {
case TS_RUN:
disp_lock_enter_high(&swapped_lock);
THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
break;
case TS_ONPROC:
disp_lock_enter_high(&swapped_lock);
THREAD_TRANSITION(tp);
wake_sched_sec = 1; /* tell clock to wake sched */
THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
break;
default:
panic("disp_swapped: tp: %p bad t_state", (void *)tp);
}
}
/*
* This routine is called by setbackdq/setfrontdq if the thread is
* either not loaded, or loaded but on the swap queue.
*
* Thread state TS_SLEEP implies that a swapped thread
* has been woken up and needs to be swapped in by the swapper.
*
* Thread state TS_RUN implies that the priority of a swapped
* thread is being increased by its scheduling class (e.g. ts_update).
*/
static void
disp_swapped_setrun(kthread_t *tp)
{
ASSERT(THREAD_LOCK_HELD(tp));
ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
switch (tp->t_state) {
case TS_SLEEP:
disp_lock_enter_high(&swapped_lock);
/*
* Wakeup sched immediately (i.e., next tick) if the
* thread priority is above maxclsyspri.
*/
if (DISP_PRIO(tp) > maxclsyspri)
wake_sched = 1;
else
wake_sched_sec = 1;
THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
break;
case TS_RUN: /* called from ts_update */
break;
default:
panic("disp_swapped_setrun: tp: %p bad t_state", tp);
}
}
/*
* Make a thread give up its processor. Find the processor on
* which this thread is executing, and have that processor
* preempt.
*/
void
cpu_surrender(kthread_t *tp)
{
cpu_t *cpup;
int max_pri;
int max_run_pri;
klwp_t *lwp;
ASSERT(THREAD_LOCK_HELD(tp));
if (tp->t_state != TS_ONPROC)
return;
cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
if (max_pri < max_run_pri)
max_pri = max_run_pri;
cpup->cpu_runrun = 1;
if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
cpup->cpu_kprunrun = 1;
}
/*
* Propagate cpu_runrun and cpu_kprunrun to global visibility.
*/
membar_enter();
DTRACE_SCHED1(surrender, kthread_t *, tp);
/*
* Make the target thread take an excursion through trap()
* to do preempt() (unless we're already in trap or post_syscall,
* calling cpu_surrender via CL_TRAPRET).
*/
if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
lwp->lwp_state != LWP_USER) {
aston(tp);
if (cpup != CPU)
poke_cpu(cpup->cpu_id);
}
TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
"cpu_surrender:tid %p cpu %p", tp, cpup);
}
/*
* Commit to and ratify a scheduling decision
*/
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
pri_t tpri, maxpri;
pri_t maxkpri;
cpu_t *cpup;
ASSERT(tp != NULL);
/*
* Commit to, then ratify scheduling decision
*/
cpup = CPU;
if (cpup->cpu_runrun != 0)
cpup->cpu_runrun = 0;
if (cpup->cpu_kprunrun != 0)
cpup->cpu_kprunrun = 0;
if (cpup->cpu_chosen_level != -1)
cpup->cpu_chosen_level = -1;
membar_enter();
tpri = DISP_PRIO(tp);
maxpri = cpup->cpu_disp->disp_maxrunpri;
maxkpri = kpq->disp_maxrunpri;
if (maxpri < maxkpri)
maxpri = maxkpri;
if (tpri < maxpri) {
/*
* We should have done better; put this one back and
* indicate that the caller should try again.
*/
cpup->cpu_dispthread = curthread; /* fixup dispthread */
cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
thread_lock_high(tp);
THREAD_TRANSITION(tp);
setfrontdq(tp);
thread_unlock_nopreempt(tp);
tp = NULL;
}
return (tp);
}
/*
* See if there is any work on the dispatcher queues of other CPUs.
* If there is, dequeue the best thread and return.
*/
static kthread_t *
disp_getwork(cpu_t *cp)
{
cpu_t *ocp; /* other CPU */
cpu_t *ocp_start;
cpu_t *tcp; /* target local CPU */
kthread_t *tp;
pri_t maxpri;
int s;
disp_t *kpq; /* kp queue for this partition */
lpl_t *lpl, *lpl_leaf;
int hint, leafidx;
maxpri = -1;
tcp = NULL;
kpq = &cp->cpu_part->cp_kp_queue;
while (kpq->disp_maxrunpri >= 0) {
/*
* Try to take a thread from the kp_queue.
*/
tp = (disp_getbest(kpq));
if (tp)
return (disp_ratify(tp, kpq));
}
s = splhigh(); /* protect the cpu_active list */
/*
* Try to find something to do on another CPU's run queue.
* Loop through all other CPUs looking for the one with the highest
* priority unbound thread.
*
* On NUMA machines, the partition's CPUs are consulted in order of
* distance from the current CPU. This way, the first available
* work found is also the closest, and will suffer the least
* from being migrated.
*/
lpl = lpl_leaf = cp->cpu_lpl;
hint = leafidx = 0;
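/*
 * "hint" records where we start within each lpl's resource set, so the
 * leaf loop below can stop after one full circuit of that set.
 */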
/*
* This loop traverses the lpl hierarchy. Higher level lpls represent
* broader levels of locality
*/
do {
/* This loop iterates over the lpl's leaves */
do {
if (lpl_leaf != cp->cpu_lpl)
ocp = lpl_leaf->lpl_cpus;
else
ocp = cp->cpu_next_lpl;
/* This loop iterates over the CPUs in the leaf */
ocp_start = ocp;
do {
pri_t pri;
ASSERT(CPU_ACTIVE(ocp));
/*
* End our stroll around the partition if:
*
* - Something became runnable on the local
* queue
*
* - We're at the broadest level of locality and
* we happen across another idle CPU. At the
* highest level of locality, all CPUs will
* walk the partition's CPUs in the same
* order, so we can end our stroll taking
* comfort in knowing the other idle CPU is
* already covering the next portion of the
* list.
*/
if (cp->cpu_disp->disp_nrunnable != 0)
break;
if (ocp->cpu_dispatch_pri == -1) {
if (ocp->cpu_disp_flags &
CPU_DISP_HALTED)
continue;
else if (lpl->lpl_parent == NULL)
break;
}
/*
* If there's only one thread and the CPU
* is in the middle of a context switch,
* or it's currently running the idle thread,
* don't steal it.
*/
if ((ocp->cpu_disp_flags &
CPU_DISP_DONTSTEAL) &&
ocp->cpu_disp->disp_nrunnable == 1)
continue;
pri = ocp->cpu_disp->disp_max_unbound_pri;
if (pri > maxpri) {
maxpri = pri;
tcp = ocp;
}
} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
leafidx = 0;
lpl_leaf = lpl->lpl_rset[leafidx];
}
} while (leafidx != hint);
hint = leafidx = lpl->lpl_hint;
if ((lpl = lpl->lpl_parent) != NULL)
lpl_leaf = lpl->lpl_rset[hint];
} while (!tcp && lpl);
splx(s);
/*
* If another queue looks good, and there is still nothing on
* the local queue, try to transfer one or more threads
* from it to our queue.
*/
if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
tp = (disp_getbest(tcp->cpu_disp));
if (tp)
return (disp_ratify(tp, kpq));
}
return (NULL);
}
/*
* disp_fix_unbound_pri()
* Determines the maximum priority of unbound threads on the queue.
* The priority is kept for the queue, but is only increased, never
* reduced unless some CPU is looking for something on that queue.
*
* The priority argument is the known upper limit.
*
* Perhaps this should be kept accurately, but that probably means
* separate bitmaps for bound and unbound threads. Since only idled
* CPUs will have to do this recalculation, it seems better this way.
*/
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
kthread_t *tp;
dispq_t *dq;
ulong_t *dqactmap = dp->disp_qactmap;
ulong_t mapword;
int wx;
ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
ASSERT(pri >= 0); /* checked by caller */
/*
* Start the search at the next lowest priority below the supplied
* priority. This depends on the bitmap implementation.
*/
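/*
 * For example, on a 64-bit kernel (where BT_ULSHIFT is 6) a priority of
 * 70 lives in map word 70 >> 6 == 1, at bit 70 & 0x3f == 6 of that word.
 */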
do {
wx = pri >> BT_ULSHIFT; /* index of word in map */
/*
* Form mask for all lower priorities in the word.
*/
mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
/*
* Get next lower active priority.
*/
if (mapword != 0) {
pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
} else if (wx > 0) {
pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
if (pri < 0)
break;
} else {
pri = -1;
break;
}
/*
* Search the queue for unbound, runnable threads.
*/
dq = &dp->disp_q[pri];
tp = dq->dq_first;
while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
tp = tp->t_link;
}
/*
* If a thread was found, set the priority and return.
*/
} while (tp == NULL);
/*
* pri holds the maximum unbound thread priority or -1.
*/
if (dp->disp_max_unbound_pri != pri)
dp->disp_max_unbound_pri = pri;
}
/*
* disp_adjust_unbound_pri() - thread is becoming unbound, so we should
* check if the CPU to which it was previously bound should have
* its disp_max_unbound_pri increased.
*/
void
disp_adjust_unbound_pri(kthread_t *tp)
{
disp_t *dp;
pri_t tpri;
ASSERT(THREAD_LOCK_HELD(tp));
/*
* Don't do anything if the thread is not bound, or
* currently not runnable or swapped out.
*/
if (tp->t_bound_cpu == NULL ||
tp->t_state != TS_RUN ||
tp->t_schedflag & TS_ON_SWAPQ)
return;
tpri = DISP_PRIO(tp);
dp = tp->t_bound_cpu->cpu_disp;
ASSERT(tpri >= 0 && tpri < dp->disp_npri);
if (tpri > dp->disp_max_unbound_pri)
dp->disp_max_unbound_pri = tpri;
}
/*
* disp_getbest() - de-queue the highest priority unbound runnable thread.
* returns with the thread unlocked and onproc
* but at splhigh (like disp()).
* returns NULL if nothing found.
*
* Passed a pointer to a dispatch queue not associated with this CPU.
*/
static kthread_t *
disp_getbest(disp_t *dp)
{
kthread_t *tp;
dispq_t *dq;
pri_t pri;
cpu_t *cp;
disp_lock_enter(&dp->disp_lock);
/*
* If there is nothing to run, or the CPU is in the middle of a
* context switch of the only thread, return NULL.
*/
pri = dp->disp_max_unbound_pri;
if (pri == -1 ||
(dp->disp_cpu != NULL &&
(dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) {
disp_lock_exit_nopreempt(&dp->disp_lock);
return (NULL);
}
dq = &dp->disp_q[pri];
tp = dq->dq_first;
/*
* Skip over bound threads.
* Bound threads can be here even though disp_max_unbound_pri
* indicated this level. Besides, it is not always accurate because it
* isn't reduced until another CPU looks for work.
* Note that tp could be NULL right away due to this.
*/
while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
tp = tp->t_link;
}
/*
* If there were no unbound threads on this queue, find the queue
* where they are and then return NULL so that other CPUs will be
* considered.
*/
if (tp == NULL) {
disp_fix_unbound_pri(dp, pri);
disp_lock_exit_nopreempt(&dp->disp_lock);
return (NULL);
}
/*
* Found a runnable, unbound thread, so remove it from queue.
* dispdeq() requires that we have the thread locked, and we do,
* by virtue of holding the dispatch queue lock. dispdeq() will
* put the thread in transition state, thereby dropping the dispq
* lock.
*/
#ifdef DEBUG
{
int thread_was_on_queue;
thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
ASSERT(thread_was_on_queue);
}
#else /* DEBUG */
(void) dispdeq(tp); /* drops disp_lock */
#endif /* DEBUG */
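/* Prevent another CPU from swapping this thread out before we run it. */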
tp->t_schedflag |= TS_DONT_SWAP;
/*
* Setup thread to run on the current CPU.
*/
cp = CPU;
tp->t_disp_queue = cp->cpu_disp;
cp->cpu_dispthread = tp; /* protected by spl only */
cp->cpu_dispatch_pri = pri;
ASSERT(pri == DISP_PRIO(tp));
thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
/*
* Return with spl high so that swtch() won't need to raise it.
* The disp_lock was dropped by dispdeq().
*/
return (tp);
}
/*
* disp_bound_common() - common routine for higher level functions
* that check for bound threads under certain conditions.
* If 'threadlistsafe' is set then there is no need to acquire
* pidlock to stop the thread list from changing (eg, if
* disp_bound_* is called with cpus paused).
*/
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
int found = 0;
kthread_t *tp;
ASSERT(flag);
if (!threadlistsafe)
mutex_enter(&pidlock);
tp = curthread; /* faster than allthreads */
do {
if (tp->t_state != TS_FREE) {
/*
* If an interrupt thread is busy, but the
* caller doesn't care (i.e. BOUND_INTR is off),
* then just ignore it and continue through.
*/
if ((tp->t_flag & T_INTR_THREAD) &&
!(flag & BOUND_INTR))
continue;
/*
* Skip the idle thread for the CPU
* we're about to set offline.
*/
if (tp == cp->cpu_idle_thread)
continue;
/*
* Skip the pause thread for the CPU
* we're about to set offline.
*/
if (tp == cp->cpu_pause_thread)
continue;
if ((flag & BOUND_CPU) &&
(tp->t_bound_cpu == cp ||
tp->t_bind_cpu == cp->cpu_id ||
tp->t_weakbound_cpu == cp)) {
found = 1;
break;
}
if ((flag & BOUND_PARTITION) &&
(tp->t_cpupart == cp->cpu_part)) {
found = 1;
break;
}
}
} while ((tp = tp->t_next) != curthread && found == 0);
if (!threadlistsafe)
mutex_exit(&pidlock);
return (found);
}
/*
* disp_bound_threads - return nonzero if threads are bound to the processor.
* Called infrequently. Keep this simple.
* Includes threads that are asleep or stopped but not onproc.
*/
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}
/*
* disp_bound_anythreads - return nonzero if _any_ threads are bound
* to the given processor, including interrupt threads.
*/
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}
/*
* disp_bound_partition - return nonzero if threads are bound to the same
* partition as the processor.
* Called infrequently. Keep this simple.
* Includes threads that are asleep or stopped but not onproc.
*/
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}
/*
* disp_cpu_inactive - make a CPU inactive by moving all of its unbound
* threads to other CPUs.
*/
void
disp_cpu_inactive(cpu_t *cp)
{
kthread_t *tp;
disp_t *dp = cp->cpu_disp;
dispq_t *dq;
pri_t pri;
int wasonq;
disp_lock_enter(&dp->disp_lock);
while ((pri = dp->disp_max_unbound_pri) != -1) {
dq = &dp->disp_q[pri];
tp = dq->dq_first;
/*
* Skip over bound threads.
*/
while (tp != NULL && tp->t_bound_cpu != NULL) {
tp = tp->t_link;
}
if (tp == NULL) {
/* disp_max_unbound_pri must be inaccurate, so fix it */
disp_fix_unbound_pri(dp, pri);
continue;
}
wasonq = dispdeq(tp); /* drops disp_lock */
ASSERT(wasonq);
ASSERT(tp->t_weakbound_cpu == NULL);
setbackdq(tp);
/*
* Called from cpu_offline:
*
* cp has already been removed from the list of active cpus
* and tp->t_cpu has been changed so there is no risk of
* tp ending up back on cp.
*
* Called from cpupart_move_cpu:
*
* The cpu has moved to a new cpupart. Any threads that
* were on its dispatch queues before the move remain
* in the old partition and can't run in the new partition.
*/
ASSERT(tp->t_cpu != cp);
thread_unlock(tp);
disp_lock_enter(&dp->disp_lock);
}
disp_lock_exit(&dp->disp_lock);
}
/*
* disp_lowpri_cpu - find CPU running the lowest priority thread.
* The hint passed in is used as a starting point so we don't favor
* CPU 0 or any other CPU. The caller should pass in the most recently
* used CPU for the thread.
*
* The lgroup and priority are used to determine the best CPU to run on
* in a NUMA machine. The lgroup specifies which CPUs are closest while
* the thread priority will indicate whether the thread will actually run
* there. To pick the best CPU, the CPUs inside and outside of the given
* lgroup which are running the lowest priority threads are found. The
* remote CPU is chosen only if the thread will not run locally on a CPU
* within the lgroup, but will run on the remote CPU. If the thread
* cannot immediately run on any CPU, the best local CPU will be chosen.
*
* The lpl specified also identifies the cpu partition from which
* disp_lowpri_cpu should select a CPU.
*
* curcpu is used to indicate that disp_lowpri_cpu is being called on
* behalf of the current thread. (curthread is looking for a new cpu)
* In this case, cpu_dispatch_pri for this thread's cpu should be
* ignored.
*
* If a cpu is the target of an offline request then try to avoid it.
*
* This function must be called at either high SPL, or with preemption
* disabled, so that the "hint" CPU cannot be removed from the online
* CPU list while we are traversing it.
*/
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
cpu_t *bestcpu;
cpu_t *besthomecpu;
cpu_t *cp, *cpstart;
pri_t bestpri;
pri_t cpupri;
klgrpset_t done;
klgrpset_t cur_set;
lpl_t *lpl_iter, *lpl_leaf;
int i;
/*
* Scan for a CPU currently running the lowest priority thread.
* Cannot get cpu_lock here because it is adaptive.
* We do not require lock on CPU list.
*/
ASSERT(hint != NULL);
ASSERT(lpl != NULL);
ASSERT(lpl->lpl_ncpu > 0);
/*
* First examine local CPUs. Note that it's possible the hint CPU
* passed in is remote to the specified home lgroup. If our priority
* isn't sufficient for us to run immediately at home, then examine
* CPUs remote to our home lgroup.
* We would like to give preference to CPUs closest to "home".
* If we can't find a CPU where we'll run at a given level
* of locality, we expand our search to include the next level.
*/
bestcpu = besthomecpu = NULL;
klgrpset_clear(done);
/* start with lpl we were passed */
lpl_iter = lpl;
do {
bestpri = SHRT_MAX;
klgrpset_clear(cur_set);
for (i = 0; i < lpl_iter->lpl_nrset; i++) {
lpl_leaf = lpl_iter->lpl_rset[i];
if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
continue;
klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
if (hint->cpu_lpl == lpl_leaf)
cp = cpstart = hint;
else
cp = cpstart = lpl_leaf->lpl_cpus;
do {
if (cp == curcpu)
cpupri = -1;
else if (cp == cpu_inmotion)
cpupri = SHRT_MAX;
else
cpupri = cp->cpu_dispatch_pri;
if (cp->cpu_disp->disp_maxrunpri > cpupri)
cpupri = cp->cpu_disp->disp_maxrunpri;
if (cp->cpu_chosen_level > cpupri)
cpupri = cp->cpu_chosen_level;
if (cpupri < bestpri) {
if (CPU_IDLING(cpupri)) {
ASSERT((cp->cpu_flags &
CPU_QUIESCED) == 0);
return (cp);
}
bestcpu = cp;
bestpri = cpupri;
}
} while ((cp = cp->cpu_next_lpl) != cpstart);
}
if (bestcpu && (tpri > bestpri)) {
ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
return (bestcpu);
}
if (besthomecpu == NULL)
besthomecpu = bestcpu;
/*
* Add the lgrps we just considered to the "done" set
*/
klgrpset_or(done, cur_set);
} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
/*
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
return (besthomecpu);
}
/*
* This routine provides the generic idle cpu function for all processors.
* If a processor has some specific code to execute when idle (say, to stop
* the pipeline and save power) then that routine should be defined in the
* processor-specific code (module_xx.c) and the global variable idle_cpu
* set to that function.
*/
static void
generic_idle_cpu(void)
{
}
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
/*
* Select a CPU for this thread to run on. Choose t->t_cpu unless:
* - t->t_cpu is not in this thread's assigned lgrp
* - the time since the thread last came off t->t_cpu exceeds the
* rechoose time for this cpu (ignore this if t is curthread in
* which case it's on CPU and t->t_disp_time is inaccurate)
* - t->t_cpu is presently the target of an offline or partition move
* request
*/
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
ASSERT(tpri < kpqpri);
if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
t != curthread) || t->t_cpu == cpu_inmotion) {
return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
}
/*
* Take a trip through disp_lowpri_cpu() if the thread was
* running outside its home lgroup.
*/
if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
t->t_cpu->cpu_lpl->lpl_lgrpid)) {
return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
(t == curthread) ? t->t_cpu : NULL));
}
return (t->t_cpu);
}