cpu.c revision 004231970c4b01e49120935d0c0158cfb2ebb647
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Architecture-independent CPU control functions.
*/
#include <sys/processor.h>
#include <sys/pool_pset.h>
#include <sys/archsystm.h>
#if defined(__x86)
#include <sys/x86_archext.h>
#endif
extern int mp_cpu_start(cpu_t *);
extern int mp_cpu_stop(cpu_t *);
extern int mp_cpu_poweron(cpu_t *);
extern int mp_cpu_poweroff(cpu_t *);
extern int mp_cpu_configure(int);
extern int mp_cpu_unconfigure(int);
extern void mp_cpu_faulted_enter(cpu_t *);
extern void mp_cpu_faulted_exit(cpu_t *);
#ifdef __sparcv9
#endif
/*
* cpu_lock protects ncpus, ncpus_online, cpu_flag, cpu_list, cpu_active,
* and dispatch queue reallocations. The lock ordering with respect to
* related locks is:
*
* cpu_lock --> thread_free_lock ---> p_lock ---> thread_lock()
*
* Warning: Certain sections of code do not use the cpu_lock when
* traversing the cpu_list (e.g. mutex_vector_enter(), clock()). Since
* all cpus are paused during modifications to this list, a solution
 * to protect the list is to either disable kernel preemption while
* walking the list, *or* recheck the cpu_next pointer at each
* iteration in the loop. Note that in no cases can any cached
* copies of the cpu pointers be kept as they may become invalid.
*/
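/*
 * Illustrative sketch (not part of the original file): one safe way to
 * walk cpu_list without holding cpu_lock, per the warning above, is to
 * disable kernel preemption for the duration of the walk; with preemption
 * disabled this cpu cannot be paused, so pause_cpus() cannot complete and
 * the list cannot change underneath us.  The function name is hypothetical.
 */
static void
example_walk_cpu_list(void (*func)(cpu_t *))
{
	cpu_t	*cp;

	kpreempt_disable();		/* blocks pause_cpus() completion */
	cp = cpu_list;
	do {
		func(cp);		/* must not block or cache cp */
	} while ((cp = cp->cpu_next) != cpu_list);
	kpreempt_enable();
}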
/*
 * max_ncpus keeps the maximum number of cpus the system can have.  Initially
* it's NCPU, but since most archs scan the devtree for cpus
* fairly early on during boot, the real max can be known before
* ncpus is set (useful for early NCPU based allocations).
*/
/*
 * Platforms that set max_ncpus to the maximum number of cpus that can be
* dynamically added will set boot_max_ncpus to the number of cpus found
* at device tree scan time during boot.
*/
int boot_max_ncpus = -1;
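/*
 * Illustrative sketch (not part of the original file): a consumer sizing
 * per-cpu storage by max_ncpus rather than NCPU, as the comments above
 * suggest.  The array and function names are hypothetical.
 */
static uint64_t *example_percpu_counts;

static void
example_alloc_percpu_counts(void)
{
	/* max_ncpus is settled once the devtree has been scanned */
	example_percpu_counts = kmem_zalloc(max_ncpus * sizeof (uint64_t),
	    KM_SLEEP);
}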
/*
* Maximum possible CPU id. This can never be >= NCPU since NCPU is
* used to size arrays that are indexed by CPU id.
*/
int ncpus = 1;
int ncpus_online = 1;
/*
* CPU that we're trying to offline. Protected by cpu_lock.
 */
cpu_t *cpu_inmotion;
/*
 * Can be raised to suppress further weakbinding, which are instead
 * satisfied by disabling preemption.  Must be raised/lowered under cpu_lock,
 * while individual thread weakbinding synchronisation is done under thread
* lock.
*/
int weakbindingbarrier;
/*
* values for safe_list. Pause state that CPUs are in.
*/
#define	PAUSE_IDLE	0	/* normal state */
#define	PAUSE_READY	1	/* paused thread ready to spl */
#define	PAUSE_WAIT	2	/* paused thread is spl'ed */
#define	PAUSE_DIE	3	/* tell pause thread to leave */
#define	PAUSE_DEAD	4	/* pause thread has left */
/*
* Variables used in pause_cpus().
*/
static struct _cpu_pause_info {
int cp_spl; /* spl saved in pause_cpus() */
volatile int cp_go; /* Go signal sent after all ready */
	int cp_count; /* # of CPUs to pause */
} cpu_pause_info;
static kmutex_t pause_free_mutex;
static kcondvar_t pause_free_cv;
static struct cpu_sys_stats_ks_data {
{ "cpu_ticks_idle", KSTAT_DATA_UINT64 },
{ "cpu_ticks_user", KSTAT_DATA_UINT64 },
{ "cpu_ticks_kernel", KSTAT_DATA_UINT64 },
{ "cpu_ticks_wait", KSTAT_DATA_UINT64 },
{ "cpu_nsec_idle", KSTAT_DATA_UINT64 },
{ "cpu_nsec_user", KSTAT_DATA_UINT64 },
{ "cpu_nsec_kernel", KSTAT_DATA_UINT64 },
{ "wait_ticks_io", KSTAT_DATA_UINT64 },
{ "bread", KSTAT_DATA_UINT64 },
{ "bwrite", KSTAT_DATA_UINT64 },
{ "lread", KSTAT_DATA_UINT64 },
{ "lwrite", KSTAT_DATA_UINT64 },
{ "phread", KSTAT_DATA_UINT64 },
{ "phwrite", KSTAT_DATA_UINT64 },
{ "pswitch", KSTAT_DATA_UINT64 },
{ "trap", KSTAT_DATA_UINT64 },
{ "intr", KSTAT_DATA_UINT64 },
{ "syscall", KSTAT_DATA_UINT64 },
{ "sysread", KSTAT_DATA_UINT64 },
{ "syswrite", KSTAT_DATA_UINT64 },
{ "sysfork", KSTAT_DATA_UINT64 },
{ "sysvfork", KSTAT_DATA_UINT64 },
{ "sysexec", KSTAT_DATA_UINT64 },
{ "readch", KSTAT_DATA_UINT64 },
{ "writech", KSTAT_DATA_UINT64 },
{ "rcvint", KSTAT_DATA_UINT64 },
{ "xmtint", KSTAT_DATA_UINT64 },
{ "mdmint", KSTAT_DATA_UINT64 },
{ "rawch", KSTAT_DATA_UINT64 },
{ "canch", KSTAT_DATA_UINT64 },
{ "outch", KSTAT_DATA_UINT64 },
{ "msg", KSTAT_DATA_UINT64 },
{ "sema", KSTAT_DATA_UINT64 },
{ "namei", KSTAT_DATA_UINT64 },
{ "ufsiget", KSTAT_DATA_UINT64 },
{ "ufsdirblk", KSTAT_DATA_UINT64 },
{ "ufsipage", KSTAT_DATA_UINT64 },
{ "ufsinopage", KSTAT_DATA_UINT64 },
{ "procovf", KSTAT_DATA_UINT64 },
{ "intrthread", KSTAT_DATA_UINT64 },
{ "intrblk", KSTAT_DATA_UINT64 },
{ "intrunpin", KSTAT_DATA_UINT64 },
{ "idlethread", KSTAT_DATA_UINT64 },
{ "inv_swtch", KSTAT_DATA_UINT64 },
{ "nthreads", KSTAT_DATA_UINT64 },
{ "cpumigrate", KSTAT_DATA_UINT64 },
{ "xcalls", KSTAT_DATA_UINT64 },
{ "mutex_adenters", KSTAT_DATA_UINT64 },
{ "rw_rdfails", KSTAT_DATA_UINT64 },
{ "rw_wrfails", KSTAT_DATA_UINT64 },
{ "modload", KSTAT_DATA_UINT64 },
{ "modunload", KSTAT_DATA_UINT64 },
{ "bawrite", KSTAT_DATA_UINT64 },
{ "iowait", KSTAT_DATA_UINT64 },
};
static struct cpu_vm_stats_ks_data {
{ "pgrec", KSTAT_DATA_UINT64 },
{ "pgfrec", KSTAT_DATA_UINT64 },
{ "pgin", KSTAT_DATA_UINT64 },
{ "pgpgin", KSTAT_DATA_UINT64 },
{ "pgout", KSTAT_DATA_UINT64 },
{ "pgpgout", KSTAT_DATA_UINT64 },
{ "swapin", KSTAT_DATA_UINT64 },
{ "pgswapin", KSTAT_DATA_UINT64 },
{ "swapout", KSTAT_DATA_UINT64 },
{ "pgswapout", KSTAT_DATA_UINT64 },
{ "zfod", KSTAT_DATA_UINT64 },
{ "dfree", KSTAT_DATA_UINT64 },
{ "scan", KSTAT_DATA_UINT64 },
{ "rev", KSTAT_DATA_UINT64 },
{ "hat_fault", KSTAT_DATA_UINT64 },
{ "as_fault", KSTAT_DATA_UINT64 },
{ "maj_fault", KSTAT_DATA_UINT64 },
{ "cow_fault", KSTAT_DATA_UINT64 },
{ "prot_fault", KSTAT_DATA_UINT64 },
{ "softlock", KSTAT_DATA_UINT64 },
{ "kernel_asflt", KSTAT_DATA_UINT64 },
{ "pgrrun", KSTAT_DATA_UINT64 },
{ "execpgin", KSTAT_DATA_UINT64 },
{ "execpgout", KSTAT_DATA_UINT64 },
{ "execfree", KSTAT_DATA_UINT64 },
{ "anonpgin", KSTAT_DATA_UINT64 },
{ "anonpgout", KSTAT_DATA_UINT64 },
{ "anonfree", KSTAT_DATA_UINT64 },
{ "fspgin", KSTAT_DATA_UINT64 },
{ "fspgout", KSTAT_DATA_UINT64 },
{ "fsfree", KSTAT_DATA_UINT64 },
};
/*
* Force the specified thread to migrate to the appropriate processor.
* Called with thread lock held, returns with it dropped.
*/
static void
{
swtch();
} else {
}
}
}
/*
* Set affinity for a specified CPU.
* A reference count is incremented and the affinity is held until the
* reference count is decremented to zero by thread_affinity_clear().
* This is so regions of code requiring affinity can be nested.
* Caller needs to ensure that cpu_id remains valid, which can be
* done by holding cpu_lock across this call, unless the caller
* specifies CPU_CURRENT in which case the cpu_lock will be acquired
* by thread_affinity_set and CPU->cpu_id will be the target CPU.
*/
void
{
int c;
if ((c = cpu_id) == CPU_CURRENT) {
}
/*
* We should be asserting that cpu_lock is held here, but
* the NCA code doesn't acquire it. The following assert
* should be uncommented when the NCA code is fixed.
*
* ASSERT(MUTEX_HELD(&cpu_lock));
*/
/*
* If there is already a hard affinity requested, and this affinity
* conflicts with that, panic.
*/
thread_lock(t);
panic("affinity_set: setting %p but already bound to %p",
(void *)cp, (void *)t->t_bound_cpu);
}
t->t_affinitycnt++;
t->t_bound_cpu = cp;
/*
* Make sure we're running on the right CPU.
*/
force_thread_migrate(t); /* drops thread lock */
} else {
thread_unlock(t);
}
if (c == CPU_CURRENT)
}
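/*
 * Illustrative sketch (not part of the original file): typical nested use
 * of the affinity interface above.  The function name is hypothetical.
 */
static void
example_affinity_section(void)
{
	/*
	 * CPU_CURRENT pins curthread to the cpu it is running on and
	 * makes thread_affinity_set() acquire cpu_lock on our behalf.
	 */
	thread_affinity_set(curthread, CPU_CURRENT);

	/* ... operate on state tied to this cpu; nesting is allowed ... */

	thread_affinity_clear(curthread);	/* drops one reference */
}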
/*
* Wrapper for backward compatibility.
*/
void
affinity_set(int cpu_id)
{
}
/*
* Decrement the affinity reservation count and if it becomes zero,
* clear the CPU affinity for the current thread, or set it to the user's
* software binding request.
*/
void
{
register processorid_t binding;
thread_lock(t);
if (--t->t_affinitycnt == 0) {
/*
* Adjust disp_max_unbound_pri if necessary.
*/
t->t_bound_cpu = NULL;
return;
}
} else {
/*
* Make sure the thread is running on the bound CPU.
*/
if (t->t_cpu != t->t_bound_cpu) {
return; /* already dropped lock */
}
}
}
thread_unlock(t);
}
/*
* Wrapper for backward compatibility.
*/
void
affinity_clear(void)
{
}
/*
* Weak cpu affinity. Bind to the "current" cpu for short periods
* of time during which the thread must not block (but may be preempted).
* Use this instead of kpreempt_disable() when it is only "no migration"
* rather than "no preemption" semantics that are required - disabling
* preemption holds higher priority threads off of cpu and if the
* operation that is protected is more than momentary this is not good
* for realtime etc.
*
* Weakly bound threads will not prevent a cpu from being offlined -
* we'll only run them on the cpu to which they are weakly bound but
* (because they do not block) we'll always be able to move them on to
* another cpu at offline time if we give them just a short moment to
* run during which they will unbind. To give a cpu a chance of offlining,
 * however, we require a barrier to further weak bindings for that cpu
 * (so that existing weak bindings can drop); the cpu_inmotion pointer is
 * that barrier.
*
* There are few restrictions on the calling context of thread_nomigrate.
* The caller must not hold the thread lock. Calls may be nested.
*
* After weakbinding a thread must not perform actions that may block.
* In particular it must not call thread_affinity_set; calling that when
* already weakbound is nonsensical anyway.
*
* If curthread is prevented from migrating for other reasons
* (kernel preemption disabled; high pil; strongly bound; interrupt thread)
 * then the weak binding will succeed even if this cpu is the target of an
 * offline/move request.
*/
void
thread_nomigrate(void)
{
kthread_id_t t = curthread;
/*
* A highlevel interrupt must not modify t_nomigrate or
* t_weakbound_cpu of the thread it has interrupted. A lowlevel
* interrupt thread cannot migrate and we can avoid the
* thread_lock call below by short-circuiting here. In either
* case we can just return since no migration is possible and
* the condition will persist (ie, when we test for these again
* in thread_allowmigrate they can't have changed). Migration
* is also impossible if we're at or above DISP_LEVEL pil.
*/
getpil() >= DISP_LEVEL) {
return;
}
/*
* We must be consistent with existing weak bindings. Since we
* may be interrupted between the increment of t_nomigrate and
* the store to t_weakbound_cpu below we cannot assume that
* t_weakbound_cpu will be set if t_nomigrate is. Note that we
* cannot assert t_weakbound_cpu == t_bind_cpu since that is not
* always the case.
*/
if (!panicstr)
panic("thread_nomigrate: binding to %p but already "
"bound to %p", (void *)cp,
(void *)t->t_weakbound_cpu);
}
/*
* At this point we have preemption disabled and we don't yet hold
* the thread lock. So it's possible that somebody else could
* set t_bind_cpu here and not be able to force us across to the
* new cpu (since we have preemption disabled).
*/
/*
* If further weak bindings are being (temporarily) suppressed then
* we'll settle for disabling kernel preemption (which assures
* no migration provided the thread does not block which it is
* not allowed to if using thread_nomigrate). We must remember
* this disposition so we can take appropriate action in
* thread_allowmigrate. If this is a nested call and the
* thread is already weakbound then fall through as normal.
* We remember the decision to settle for kpreempt_disable through
* negative nesting counting in t_nomigrate. Once a thread has had one
* weakbinding request satisfied in this way any further (nested)
* requests will continue to be satisfied in the same way,
* even if weak bindings have recommenced.
*/
--t->t_nomigrate;
return; /* with kpreempt_disable still active */
}
/*
* We hold thread_lock so t_bind_cpu cannot change. We could,
 * however, be running on a cpu other than the one we are t_bound_cpu
* to (as explained above). If we grant the weak binding request
* in that case then the dispatcher must favour our weak binding
* over our strong (in which case, just as when preemption is
* disabled, we can continue to run on a cpu other than the one to
* which we are strongbound; the difference in this case is that
* this thread can be preempted and so can appear on the dispatch
* queues of a cpu other than the one it is strongbound to).
*
* If the cpu we are running on does not appear to be a current
* offline target (we check cpu_inmotion to determine this - since
* we don't hold cpu_lock we may not see a recent store to that,
* so it's possible that we at times can grant a weak binding to a
* cpu that is an offline target, but that one request will not
* prevent the offline from succeeding) then we will always grant
* the weak binding request. This includes the case above where
* we grant a weakbinding not commensurate with our strong binding.
*
* If our cpu does appear to be an offline target then we're inclined
* not to grant the weakbinding request just yet - we'd prefer to
* migrate to another cpu and grant the request there. The
* exceptions are those cases where going through preemption code
* will not result in us changing cpu:
*
* . interrupts have already bypassed this case (see above)
* . we are already weakbound to this cpu (dispatcher code will
* always return us to the weakbound cpu)
* . preemption was disabled even before we disabled it above
* . we are strongbound to this cpu (if we're strongbound to
* another and not yet running there the trip through the
* dispatcher will move us to the strongbound cpu and we
* will grant the weak binding there)
*/
t->t_bound_cpu == cp) {
/*
* Don't be tempted to store to t_weakbound_cpu only on
* the first nested bind request - if we're interrupted
* after the increment of t_nomigrate and before the
* store to t_weakbound_cpu and the interrupt calls
* thread_nomigrate then the assertion in thread_allowmigrate
* would fail.
*/
t->t_nomigrate++;
t->t_weakbound_cpu = cp;
/*
* Now that we have dropped the thread_lock another thread
* can set our t_weakbound_cpu, and will try to migrate us
* to the strongbound cpu (which will not be prevented by
* preemption being disabled since we're about to enable
* preemption). We have granted the weakbinding to the current
 * cpu, so again we are in the position that it is possible
* that our weak and strong bindings differ. Again this
* is catered for by dispatcher code which will favour our
* weak binding.
*/
} else {
/*
* Move to another cpu before granting the request by
* forcing this thread through preemption code. When we
* get to set{front,back}dq called from CL_PREEMPT()
* cpu_choose() will be used to select a cpu to queue
* us on - that will see cpu_inmotion and take
* steps to avoid returning us to this cpu.
*/
kpreempt_enable(); /* will call preempt() */
goto again;
}
}
void
thread_allowmigrate(void)
{
kthread_id_t t = curthread;
(t->t_nomigrate < 0 && t->t_preempt > 0) ||
getpil() >= DISP_LEVEL);
getpil() >= DISP_LEVEL)
return;
if (t->t_nomigrate < 0) {
/*
* This thread was granted "weak binding" in the
* stronger form of kernel preemption disabling.
* Undo a level of nesting for both t_nomigrate
* and t_preempt.
*/
++t->t_nomigrate;
} else if (--t->t_nomigrate == 0) {
/*
* Time to drop the weak binding. We need to cater
* for the case where we're weakbound to a different
* cpu than that to which we're strongbound (a very
* temporary arrangement that must only persist until
* weak binding drops). We don't acquire thread_lock
* here so even as this code executes t_bound_cpu
* may be changing. So we disable preemption and
* a) in the case that t_bound_cpu changes while we
* have preemption disabled kprunrun will be set
* asynchronously, and b) if before disabling
* preemption we were already on a different cpu to
* our t_bound_cpu then we set kprunrun ourselves
* to force a trip through the dispatcher when
* preemption is enabled.
*/
if (t->t_bound_cpu &&
t->t_weakbound_cpu != t->t_bound_cpu)
t->t_weakbound_cpu = NULL;
}
}
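/*
 * Illustrative sketch (not part of the original file): the intended
 * pairing of the weak binding interfaces above.  The function name is
 * hypothetical.
 */
static void
example_weakbound_section(void)
{
	thread_nomigrate();	/* weakly bind to the current cpu */

	/*
	 * Code here may be preempted but will not migrate, so CPU
	 * refers to the same cpu throughout; it must not block.
	 */

	thread_allowmigrate();	/* drop one level of weak binding */
}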
/*
* weakbinding_stop can be used to temporarily cause weakbindings made
* with thread_nomigrate to be satisfied through the stronger action of
* kpreempt_disable. weakbinding_start recommences normal weakbinding.
*/
void
weakbinding_stop(void)
{
weakbindingbarrier = 1;
membar_producer(); /* make visible before subsequent thread_lock */
}
void
weakbinding_start(void)
{
weakbindingbarrier = 0;
}
/*
* This routine is called to place the CPUs in a safe place so that
* one of them can be taken off line or placed on line. What we are
* trying to do here is prevent a thread from traversing the list
* of active CPUs while we are changing it or from getting placed on
* the run queue of a CPU that has just gone off line. We do this by
* creating a thread with the highest possible prio for each CPU and
* having it call this routine. The advantage of this method is that
* we can eliminate all checks for CPU_ACTIVE in the disp routines.
* This makes disp faster at the expense of making p_online() slower
* which is a good trade off.
*/
static void
{
int s;
*safe = PAUSE_READY;
membar_enter(); /* make sure stores are flushed */
/*
* Wait here until all pause threads are running. That
* indicates that it's safe to do the spl. Until
* cpu_pause_info.cp_go is set, we don't want to spl
* because that might block clock interrupts needed
* to preempt threads on other CPUs.
*/
;
/*
* Even though we are at the highest disp prio, we need
* to block out all interrupts below LOCK_LEVEL so that
 * an intr doesn't come in, wake up a thread, and call
 * setbackdq/setfrontdq.
*/
s = splhigh();
/*
* This cpu is now safe.
*/
*safe = PAUSE_WAIT;
membar_enter(); /* make sure stores are flushed */
/*
* Now we wait. When we are allowed to continue, safe will
* be set to PAUSE_IDLE.
*/
while (*safe != PAUSE_IDLE)
;
splx(s);
/*
* Waiting is at an end. Switch out of cpu_pause
* loop and resume useful work.
*/
swtch();
}
*safe = PAUSE_DEAD;
}
/*
* Allow the cpus to start running again.
*/
void
{
int i;
for (i = 0; i < NCPU; i++)
safe_list[i] = PAUSE_IDLE;
membar_enter(); /* make sure stores are flushed */
}
/*
* Allocate a pause thread for a CPU.
*/
static void
{
kthread_id_t t;
/*
* Note, v.v_nglobpris will not change value as long as I hold
* cpu_lock.
*/
thread_lock(t);
t->t_bound_cpu = cp;
t->t_affinitycnt = 1;
t->t_preempt = 1;
thread_unlock(t);
cp->cpu_pause_thread = t;
/*
* Registering a thread in the callback table is usually done
* in the initialization code of the thread. In this
* case, we do it right after thread creation because the
* thread itself may never run, and we need to register the
* fact that it is safe for cpr suspend.
*/
CALLB_CPR_INIT_SAFE(t, "cpu_pause");
}
/*
* Free a pause thread for a CPU.
*/
static void
{
kthread_id_t t;
/*
* We have to get the thread and tell him to die.
*/
return;
}
thread_lock(t);
setbackdq(t);
/*
* If we don't wait for the thread to actually die, it may try to
* run on the wrong cpu as part of an actual call to pause_cpus().
*/
}
}
/*
* Initialize basic structures for pausing CPUs.
*/
void
{
/*
* Create initial CPU pause thread.
*/
}
/*
* Start the threads used to pause another CPU.
*/
static int
{
int i;
int cpu_count = 0;
for (i = 0; i < NCPU; i++) {
kthread_id_t t;
safe_list[i] = PAUSE_WAIT;
continue;
}
/*
* Skip CPU if it is quiesced or not yet started.
*/
safe_list[i] = PAUSE_WAIT;
continue;
}
/*
* Start this CPU's pause thread.
*/
t = cp->cpu_pause_thread;
thread_lock(t);
/*
* Reset the priority, since nglobpris may have
* changed since the thread was created, if someone
* has loaded the RT (or some other) scheduling
* class.
*/
setbackdq(t);
++cpu_count;
}
return (cpu_count);
}
/*
* Pause all of the CPUs except the one we are on by creating a high
* priority thread bound to those CPUs.
*
* Note that one must be extremely careful regarding code
* executed while CPUs are paused. Since a CPU may be paused
* while a thread scheduling on that CPU is holding an adaptive
* lock, code executed with CPUs paused must not acquire adaptive
* (or low-level spin) locks. Also, such code must not block,
* since the thread that is supposed to initiate the wakeup may
* never run.
*
* With a few exceptions, the restrictions on code executed with CPUs
* paused match those for code executed at high-level interrupt
* context.
*/
void
{
int i;
for (i = 0; i < NCPU; i++)
safe_list[i] = PAUSE_IDLE;
/*
* If running on the cpu that is going offline, get off it.
* This is so that it won't be necessary to rechoose a CPU
* when done.
*/
else
/*
* Start the pause threads and record how many were started
*/
/*
* Now wait for all CPUs to be running the pause thread.
*/
/*
* Spin reading the count without grabbing the disp
* lock to make sure we don't prevent the pause
* threads from getting the lock.
*/
;
}
/*
* Now wait for all CPUs to spl. (Transition from PAUSE_READY
* to PAUSE_WAIT.)
*/
for (i = 0; i < NCPU; i++) {
while (safe_list[i] != PAUSE_WAIT)
;
}
}
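/*
 * Illustrative sketch (not part of the original file): the usual
 * pause/modify/resume pattern followed by callers of pause_cpus() and
 * start_cpus() in this file.  The function name is hypothetical.
 */
static void
example_modify_cpu_list(void)
{
	mutex_enter(&cpu_lock);
	(void) pause_cpus(NULL);	/* park all other cpus */

	/*
	 * Modify cpu_list here.  No blocking and no adaptive (or
	 * low-level spin) locks while cpus are paused; see the
	 * restrictions described above pause_cpus().
	 */

	start_cpus();			/* let the other cpus resume */
	mutex_exit(&cpu_lock);
}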
/*
* Check whether the current thread has CPUs paused
*/
int
cpus_paused(void)
{
return (1);
}
return (0);
}
static cpu_t *
{
return (NULL);
}
/*
* Check whether cpun is a valid processor id and whether it should be
* visible from the current zone. If it is, return a pointer to the
* associated CPU structure.
*/
cpu_t *
{
cpu_t *c;
c = cpu_get_all(cpun);
return (NULL);
return (c);
}
/*
* The following functions should be used to check CPU states in the kernel.
* They should be invoked with cpu_lock held. Kernel subsystems interested
 * in CPU states should *not* use cpu_get_state() and various P_ONLINE/etc
 * states.  Those are for user-land (and system call) use only.
*/
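/*
 * Illustrative sketch (not part of the original file): checking a cpu's
 * state from kernel code with cpu_lock held, as required by the
 * predicates below.  The function name is hypothetical.
 */
static int
example_cpu_usable(processorid_t id)
{
	cpu_t	*cp;
	int	usable = 0;

	mutex_enter(&cpu_lock);
	if ((cp = cpu_get(id)) != NULL && cpu_is_online(cp))
		usable = 1;
	mutex_exit(&cpu_lock);

	return (usable);
}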
/*
* Determine whether the CPU is online and handling interrupts.
*/
int
{
}
/*
* Determine whether the CPU is offline (this includes spare and faulted).
*/
int
{
}
/*
* Determine whether the CPU is powered off.
*/
int
{
}
/*
* Determine whether the CPU is handling interrupts.
*/
int
{
}
/*
* Determine whether the CPU is active (scheduling threads).
*/
int
{
}
/*
* Same as above, but these require cpu_flags instead of cpu_t pointers.
*/
int
{
return (cpu_flagged_active(cpu_flags) &&
(cpu_flags & CPU_ENABLE));
}
int
{
return (((cpu_flags & CPU_POWEROFF) == 0) &&
}
int
{
}
int
{
return (cpu_flagged_active(cpu_flags) &&
(cpu_flags & CPU_ENABLE) == 0);
}
int
{
}
/*
* Bring the indicated CPU online.
*/
int
{
int error = 0;
/*
* Handle on-line request.
* This code must put the new CPU on the active list before
* starting it because it will not be paused, and will start
* using the active list immediately. The real start occurs
* when the CPU_QUIESCED flag is turned off.
*/
/*
* Put all the cpus into a known safe place.
* No mutexes can be entered while CPUs are paused.
*/
if (error == 0) {
}
start_cpus();
}
return (error);
}
/*
* Take the indicated CPU offline.
*/
int
{
int error = 0;
int intr_enable;
int cyclic_off = 0;
int loop_count;
int no_quiesce = 0;
int (*bound_func)(struct cpu *, int);
kthread_t *t;
proc_t *p;
int lgrp_diff_lpl;
/*
* If we're going from faulted or spare to offline, just
* clear these flags and update CPU state.
*/
}
return (0);
}
/*
* Handle off-line request.
*/
/*
* Don't offline last online CPU in partition
*/
return (EBUSY);
/*
 * Unbind all threads bound to our CPU if we were asked to.
*/
return (error);
/*
* We shouldn't be bound to this CPU ourselves.
*/
return (EBUSY);
/*
* Tell interested parties that this CPU is going offline.
*/
/*
* Take the CPU out of interrupt participation so we won't find
* bound kernel threads. If the architecture cannot completely
* shut off interrupts on the CPU, don't quiesce it, but don't
* run anything but interrupt thread... this is indicated by
 * the CPU_OFFLINE flag being on but the CPU_QUIESCED flag being
* off.
*/
if (intr_enable)
/*
* Record that we are aiming to offline this cpu. This acts as
* a barrier to further weak binding requests in thread_nomigrate
* and also causes cpu_choose, disp_lowpri_cpu and setfrontdq to
* lean away from this cpu. Further strong bindings are already
* avoided since we hold cpu_lock. Since threads that are set
* runnable around now and others coming off the target cpu are
* directed away from the target, existing strong and weak bindings
* (especially the latter) to the target cpu stand maximum chance of
* being able to unbind during the short delay loop below (if other
 * unbound threads compete they may not see the cpu in time to unbind
 * even if they would do so immediately).
*/
cpu_inmotion = cp;
membar_enter();
/*
* Check for kernel threads (strong or weak) bound to that CPU.
* Strongly bound threads may not unbind, and we'll have to return
* EBUSY. Weakly bound threads should always disappear - we've
* stopped more weak binding with cpu_inmotion and existing
* bindings will drain imminently (they may not block). Nonetheless
* we will wait for a fixed period for all bound threads to disappear.
* Inactive interrupt threads are OK (they'll be in TS_FREE
 * state).  If the test finds some bound threads, wait a few ticks
 * to give short-lived threads (such as interrupts) a chance to
* complete. Note that if no_quiesce is set, i.e. this cpu
* is required to service interrupts, then we take the route
* that permits interrupt threads to be active (or bypassed).
*/
if (loop_count >= 5) {
break;
}
/*
* If some threads were assigned, give them
* a chance to complete or move.
*
* This assumes that the clock_thread is not bound
* to any CPU, because the clock_thread is needed to
* do the delay(hz/100).
*
* Note: we still hold the cpu_lock while waiting for
* the next clock tick. This is OK since it isn't
* needed for anything else except processor_bind(2),
* and system initialization. If we drop the lock,
* we would risk another p_online disabling the last
* processor.
*/
}
if (error == 0 && cyclic_off == 0) {
if (!cyclic_offline(cp)) {
/*
* We must have bound cyclics...
*/
goto out;
}
cyclic_off = 1;
}
/*
* Call mp_cpu_stop() to perform any special operations
* needed for this machine architecture to offline a CPU.
*/
if (error == 0)
/*
* If that all worked, take the CPU offline and decrement
* ncpus_online.
*/
if (error == 0) {
/*
* Put all the cpus into a known safe place.
* No mutexes can be entered while CPUs are paused.
*/
pause_cpus(cp);
/*
* Repeat the operation, if necessary, to make sure that
* all outstanding low-level interrupts run to completion
* before we set the CPU_QUIESCED flag. It's also possible
 * that a thread has weakly bound itself to the cpu despite our raising
* cpu_inmotion above since it may have loaded that
* value before the barrier became visible (this would have
* to be the thread that was on the target cpu at the time
* we raised the barrier).
*/
start_cpus();
(void) mp_cpu_start(cp);
goto again;
}
/*
* Remove the CPU from the list of active CPUs.
*/
/*
* Walk the active process list and look for threads
* whose home lgroup needs to be updated, or
* the last CPU they run on is the one being offlined now.
*/
t = p->p_tlist;
if (t == NULL)
continue;
lgrp_diff_lpl = 0;
do {
/*
* Taking last CPU in lpl offline
* Rehome thread if it is in this lpl
* Otherwise, update the count of how many
* threads are in this CPU's lgroup but have
* a different lpl.
*/
lgrp_choose(t,
t->t_cpupart), 0);
else if (t->t_lpl->lpl_lgrpid ==
}
/*
* Update CPU last ran on if it was this CPU
*/
t->t_weakbound_cpu == cp);
t = t->t_forw;
} while (t != p->p_tlist);
/*
* Didn't find any threads in the same lgroup as this
* CPU with a different lpl, so remove the lgroup from
* the process lgroup bitmask.
*/
if (lgrp_diff_lpl == 0)
}
/*
* Walk thread list looking for threads that need to be
* rehomed, since there are some threads that are not in
* their process's p_tlist.
*/
t = curthread;
do {
/*
* Rehome threads with same lpl as this CPU when this
* is the last CPU in the lpl.
*/
/*
* Update CPU last ran on if it was this CPU
*/
}
t->t_weakbound_cpu == cp);
t = t->t_next;
} while (t != curthread);
if (!no_quiesce)
ncpus_online--;
cpu_inmotion = NULL;
start_cpus();
}
out:
cpu_inmotion = NULL;
/*
* If we failed, re-enable interrupts.
* Do this even if cpu_intr_disable returned an error, because
* it may have partially disabled interrupts.
*/
if (error && intr_enable)
/*
* If we failed, but managed to offline the cyclic subsystem on this
* CPU, bring it back online.
*/
if (error && cyclic_off)
/*
* If we failed, we need to notify everyone that this CPU is back on.
*/
if (error != 0)
return (error);
}
/*
* Mark the indicated CPU as faulted, taking it offline.
*/
int
{
int error = 0;
if (cpu_is_offline(cp)) {
return (0);
}
}
return (error);
}
/*
* Mark the indicated CPU as a spare, taking it offline.
*/
int
{
int error = 0;
if (cpu_is_offline(cp)) {
}
return (0);
}
}
return (error);
}
/*
* Take the indicated CPU from poweroff to offline.
*/
int
{
if (error == 0)
return (error);
}
/*
* Take the indicated CPU from any inactive state to powered off.
*/
int
{
return (EBUSY); /* not completely idle */
if (error == 0)
return (error);
}
/*
* Initialize the CPU lists for the first CPU.
*/
void
{
cpu_active = cp;
}
/*
* Insert a CPU into the list of available CPUs.
*/
void
{
int seqid;
/*
* Note: most users of the cpu_list will grab the
* cpu_lock to insure that it isn't modified. However,
* certain users can't or won't do that. To allow this
* we pause the other cpus. Users who walk the list
* without cpu_lock, must disable kernel preemption
* to insure that the list isn't modified underneath
* them. Also, any cached pointers to cpu structures
* must be revalidated by checking to see if the
* cpu_next pointer points to itself. This check must
* be done with the cpu_lock held or kernel preemption
* disabled. This check relies upon the fact that
* old cpu structures are not free'ed or cleared after
 * they are removed from the cpu_list.
*
* Note that the clock code walks the cpu list dereferencing
* the cpu_part pointer, so we need to initialize it before
* adding the cpu to the list.
*/
(void) pause_cpus(NULL);
start_cpus();
continue;
ncpus++;
/*
* allocate a pause thread for this CPU.
*/
/*
* So that new CPUs won't have NULL prev_onln and next_onln pointers,
* link them into a list of just that CPU.
* This is so that disp_lowpri_cpu will work for thread_create in
* pause_cpus() when called from the startup thread in a new CPU.
*/
pool_pset_mod = gethrtime();
}
/*
* Do the opposite of cpu_add_unit().
*/
void
cpu_del_unit(int cpuid)
{
/*
* Destroy kstat stuff.
*/
/*
* Free up pause thread.
*/
/*
* The clock thread and mutex_vector_enter cannot hold the
* cpu_lock while traversing the cpu list, therefore we pause
* all other threads by pausing the other cpus. These, and any
* other routines holding cpu pointers while possibly sleeping
* must be sure to call kpreempt_disable before processing the
* list and be sure to check that the cpu has not been deleted
* after any sleeps (check cp->cpu_next != NULL). We guarantee
* to keep the deleted cpu structure around.
*
* Note that this MUST be done AFTER cpu_available
* has been updated so that we don't waste time
* trying to pause the cpu we're trying to delete.
*/
(void) pause_cpus(NULL);
/*
* Signals that the cpu has been deleted (see above).
*/
start_cpus();
ncpus--;
pool_pset_mod = gethrtime();
}
/*
* Add a CPU to the list of active CPUs.
* This routine must not get any locks, because other CPUs are paused.
*/
static void
{
ncpus_online++;
if (pp->cp_cpulist) {
} else {
}
ASSERT(cp_numparts_nonempty != 0);
}
}
/*
* Add a CPU to the list of active CPUs.
* This is called from machine-dependent layers when a new CPU is started.
*/
void
{
start_cpus();
}
/*
* Remove a CPU from the list of active CPUs.
* This routine must not get any locks, because other CPUs are paused.
*/
/* ARGSUSED */
static void
{
if (cpu_active == cp) {
}
}
ASSERT(cp_numparts_nonempty != 0);
}
}
/*
* Routine used to setup a newly inserted CPU in preparation for starting
* it running code.
*/
int
cpu_configure(int cpuid)
{
int retval = 0;
/*
* Some structures are statically allocated based upon
* the maximum number of cpus the system supports. Do not
* try to add anything beyond this limit.
*/
return (EINVAL);
}
return (EALREADY);
}
return (retval);
}
if (retval != 0)
(void) mp_cpu_unconfigure(cpuid);
return (retval);
}
/*
* Routine used to cleanup a CPU that has been powered off. This will
* destroy all per-cpu information related to this cpu.
*/
int
cpu_unconfigure(int cpuid)
{
int error;
return (ENODEV);
}
return (EALREADY);
}
return (EBUSY);
}
}
if (error != 0)
return (error);
return (mp_cpu_unconfigure(cpuid));
}
/*
* Routines for registering and de-registering cpu_setup callback functions.
*
* Caller's context
* These routines must not be called from a driver's attach(9E) or
* detach(9E) entry point.
*
* NOTE: CPU callbacks should not block. They are called with cpu_lock held.
*/
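/*
 * Illustrative sketch (not part of the original file): a subsystem
 * registering a cpu_setup callback.  The callback and its behaviour are
 * hypothetical; the cpu_setup_func_t signature comes from <sys/cpuvar.h>.
 * Registration would typically be done once from the subsystem's init
 * path:  register_cpu_setup_func(example_cpu_setup, NULL);
 */
static int
example_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	ASSERT(MUTEX_HELD(&cpu_lock));	/* callbacks run under cpu_lock */

	switch (what) {
	case CPU_CONFIG:
		/* set up per-cpu state for cpu "id"; must not block */
		break;
	case CPU_UNCONFIG:
		/* tear down per-cpu state for cpu "id" */
		break;
	default:
		break;
	}
	return (0);
}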
/*
* Ideally, these would be dynamically allocated and put into a linked
* list; however that is not feasible because the registration routine
* has to be available before the kmem allocator is working (in fact,
* it is called by the kmem allocator init code). In any case, there
* are quite a few extra entries for future users.
*/
#define NCPU_SETUPS 20
struct cpu_setup {
void *arg;
void
{
int i;
for (i = 0; i < NCPU_SETUPS; i++)
break;
if (i >= NCPU_SETUPS)
}
void
{
int i;
for (i = 0; i < NCPU_SETUPS; i++)
break;
if (i >= NCPU_SETUPS)
"deregister");
cpu_setups[i].arg = 0;
}
/*
* Call any state change hooks for this CPU, ignore any errors.
*/
void
{
int i;
for (i = 0; i < NCPU_SETUPS; i++) {
}
}
}
/*
* Call any state change hooks for this CPU, undo it if error found.
*/
static int
{
int i;
int retval = 0;
for (i = 0; i < NCPU_SETUPS; i++) {
cpu_setups[i].arg);
if (retval) {
for (i--; i >= 0; i--) {
}
break;
}
}
}
return (retval);
}
/*
* Export information about this CPU via the kstat mechanism.
*/
static struct {
#if defined(__sparcv9)
#endif
#endif
} cpu_info_template = {
{ "state", KSTAT_DATA_CHAR },
{ "state_begin", KSTAT_DATA_LONG },
{ "cpu_type", KSTAT_DATA_CHAR },
{ "fpu_type", KSTAT_DATA_CHAR },
{ "clock_MHz", KSTAT_DATA_LONG },
{ "chip_id", KSTAT_DATA_LONG },
{ "implementation", KSTAT_DATA_STRING },
{ "brand", KSTAT_DATA_STRING },
{ "core_id", KSTAT_DATA_LONG },
#if defined(__sparcv9)
{ "device_ID", KSTAT_DATA_UINT64 },
{ "cpu_fru", KSTAT_DATA_STRING },
#endif
{ "vendor_id", KSTAT_DATA_STRING },
{ "family", KSTAT_DATA_INT32 },
{ "model", KSTAT_DATA_INT32 },
{ "stepping", KSTAT_DATA_INT32 },
{ "clog_id", KSTAT_DATA_INT32 },
#endif
};
static kmutex_t cpu_info_template_lock;
static int
{
const char *pi_state;
if (rw == KSTAT_WRITE)
return (EACCES);
case P_ONLINE:
break;
case P_POWEROFF:
break;
case P_NOINTR:
break;
case P_FAULTED:
break;
case P_SPARE:
break;
case P_OFFLINE:
break;
default:
pi_state = "unknown";
}
#if defined(__sparcv9)
#endif
#endif
return (0);
}
static void
{
if (pool_pset_enabled())
else
sizeof (cpu_info_template) / sizeof (kstat_named_t),
#if defined(__sparcv9)
#endif
#endif
}
}
static void
{
}
/*
* Create and install kstats for the boot CPU.
*/
void
{
}
/*
* Make visible to the zone that subset of the cpu information that would be
* initialized when a cpu is configured (but still offline).
*/
void
{
zone->zone_ncpus++;
}
}
/*
* Make visible to the zone that subset of the cpu information that would be
* initialized when a previously configured cpu is onlined.
*/
void
{
}
!= NULL) {
}
}
}
NULL) {
}
}
/*
* Update relevant kstats such that cpu is now visible to processes
* executing in specified zone.
*/
void
{
if (cpu_is_active(cp))
}
/*
* Make invisible to the zone that subset of the cpu information that would be
* torn down when a previously offlined cpu is unconfigured.
*/
void
{
zone->zone_ncpus--;
}
if (cp->cpu_info_kstat)
}
/*
* Make invisible to the zone that subset of the cpu information that would be
* torn down when a cpu is offlined (but still configured).
*/
void
{
}
NULL) {
}
}
}
!= NULL) {
}
}
/*
* Update relevant kstats such that cpu is no longer visible to processes
* executing in specified zone.
*/
void
{
if (cpu_is_active(cp))
}
/*
* Bind a thread to a CPU as requested.
*/
int
int *error)
{
/*
* Record old binding, but change the obind, which was initialized
* to PBIND_NONE, only if this thread has a binding. This avoids
* reporting PBIND_NONE for a process when some LWPs are bound.
*/
if (binding != PBIND_NONE)
if (bind == PBIND_QUERY) {
return (0);
}
/*
 * If this thread/LWP cannot be bound because of permission
 * problems, just note that and return success so that the
 * other threads/LWPs will be bound.  This is the way
 * processor_bind() is defined to work.
*
* Binding will get EPERM if the thread is of system class
* or hasprocperm() fails.
*/
return (0);
}
if (binding != PBIND_NONE) {
/*
* Make sure binding is in right partition.
*/
return (0);
}
}
/*
* If there is no system-set reason for affinity, set
* the t_bound_cpu field to reflect the binding.
*/
if (tp->t_affinitycnt == 0) {
if (binding == PBIND_NONE) {
/*
* We may need to adjust disp_max_unbound_pri
* since we're becoming unbound.
*/
/*
* Move thread to lgroup with strongest affinity
* after unbinding
*/
if (tp->t_lgrp_affinity)
} else {
/*
* Set home to lgroup with most affinity containing CPU
* that thread is being bound or minimum bounding
* lgroup if no affinities set
*/
if (tp->t_lgrp_affinity)
else
/* can't grab cpu_lock */
}
/*
* Make the thread switch to the bound CPU.
* If the thread is runnable, we need to
* requeue it even if t_cpu is already set
* to the right CPU, since it may be on a
* kpreempt queue and need to move to a local
* queue. We could check t_disp_queue to
* avoid unnecessary overhead if it's already
* on the right queue, but since this isn't
* a performance-critical operation it doesn't
* seem worth the extra code and complexity.
*
* If the thread is weakbound to the cpu then it will
* resist the new binding request until the weak
* binding drops. The cpu_surrender or requeueing
* below could be skipped in such cases (since it
* will have no effect), but that would require
* thread_allowmigrate to acquire thread_lock so
* we'll take the very occasional hit here instead.
*/
/*
* Either on the bound CPU's disp queue now,
* or swapped out or on the swap queue.
*/
!= TS_LOAD);
}
}
}
/*
* Our binding has changed; set TP_CHANGEBIND.
*/
return (0);
}
#if CPUSET_WORDS > 1
/*
* Functions for implementing cpuset operations when a cpuset is more
* than one word. On platforms where a cpuset is a single word these
* are implemented as macros in cpuvar.h.
*/
void
cpuset_all(cpuset_t *s)
{
int i;
for (i = 0; i < CPUSET_WORDS; i++)
}
void
{
cpuset_all(s);
CPUSET_DEL(*s, cpu);
}
void
{
CPUSET_ZERO(*s);
CPUSET_ADD(*s, cpu);
}
int
{
int i;
for (i = 0; i < CPUSET_WORDS; i++)
if (s->cpub[i] != 0)
return (0);
return (1);
}
int
{
int i;
for (i = 0; i < CPUSET_WORDS; i++)
return (0);
return (1);
}
cpuset_find(cpuset_t *s)
{
uint_t i;
/*
* Find a cpu in the cpuset
*/
for (i = 0; i < CPUSET_WORDS; i++) {
break;
}
}
return (cpu);
}
void
{
int i, j;
/*
* First, find the smallest cpu id in the set.
*/
for (i = 0; i < CPUSET_WORDS; i++) {
if (s->cpub[i] != 0) {
/*
* Now find the largest cpu id in
* the set and return immediately.
* Done in an inner loop to avoid
* having to break out of the first
* loop.
*/
for (j = CPUSET_WORDS - 1; j >= i; j--) {
if (s->cpub[j] != 0) {
return;
}
}
/*
* If this code is reached, a
* smallestid was found, but not a
* largestid. The cpuset must have
* been changed during the course
* of this function call.
*/
ASSERT(0);
}
}
}
#endif /* CPUSET_WORDS */
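/*
 * Illustrative sketch (not part of the original file): typical use of the
 * cpuset operations, which expand to the macros in cpuvar.h on
 * single-word platforms and to the functions above otherwise.  The
 * function name is hypothetical.
 */
static void
example_cpuset_use(processorid_t id)
{
	cpuset_t set;

	CPUSET_ZERO(set);
	CPUSET_ADD(set, id);

	if (CPU_IN_SET(set, id)) {
		/* cpu "id" is a member, e.g. a target for a cross call */
	}

	CPUSET_DEL(set, id);
	ASSERT(CPUSET_ISNULL(set));
}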
/*
* Unbind all user threads bound to a given CPU.
*/
int
{
int ret = 0;
/*
* Skip zombies, kernel processes, and processes in
* other zones, if called from a non-global zone.
*/
continue;
}
do {
continue;
if (ret == 0)
}
if (ret == 0)
return (ret);
}
/*
* Destroy all remaining bound threads on a cpu.
*/
void
{
/*
* Destroy all remaining bound threads on the cpu. This
* should include both the interrupt threads and the idle thread.
* This requires some care, since we need to traverse the
* thread list with the pidlock mutex locked, but thread_free
* also locks the pidlock mutex. So, we collect the threads
* we're going to reap in a list headed by "tlist", then we
* unlock the pidlock mutex and traverse the tlist list,
 * doing thread_free()s on the threads.  Simple, n'est-ce pas?
* Also, this depends on thread_free not mucking with the
* t_next and t_prev links of the thread.
*/
do {
if (t->t_bound_cpu == cp) {
/*
* We've found a bound thread, carefully unlink
* it out of the thread list, and add it to
* our "tlist". We "know" we don't have to
* worry about unlinking curthread (the thread
* that is executing this code).
*/
tlist = t;
/* wake up anyone blocked in thread_join */
cv_broadcast(&t->t_joincv);
/*
* t_lwp set by interrupt threads and not
* cleared.
*/
/*
* Pause and idle threads always have
* t_state set to TS_ONPROC.
*/
}
thread_free(t);
}
}
}
/*
* processor_info(2) and p_online(2) status support functions
* The constants returned by the cpu_get_state() and cpu_get_state_str() are
* for use in communicating processor state information to userland. Kernel
* subsystems should only be using the cpu_flags value directly. Subsystems
* modifying cpu_flags should record the state change via a call to the
* cpu_set_state().
*/
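/*
 * Illustrative sketch (not part of the original file): reporting a cpu's
 * processor_info(2)-style state from kernel code.  The function name is
 * hypothetical.
 */
static void
example_report_cpu_state(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	cmn_err(CE_NOTE, "cpu %d is %s", cp->cpu_id, cpu_get_state_str(cp));
}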
/*
* Update the pi_state of this CPU. This function provides the CPU status for
* the information returned by processor_info(2).
*/
void
{
pool_cpu_mod = gethrtime();
}
/*
 * Return offline/online/other status for the indicated CPU.  Use only for
 * communication with user applications; cpu_flags provides the in-kernel
* interface.
*/
int
{
return (P_POWEROFF);
return (P_FAULTED);
return (P_SPARE);
return (P_OFFLINE);
return (P_ONLINE);
else
return (P_NOINTR);
}
/*
* Return processor_info(2) state as a string.
*/
const char *
{
const char *string;
switch (cpu_get_state(cpu)) {
case P_ONLINE:
break;
case P_POWEROFF:
break;
case P_NOINTR:
break;
case P_SPARE:
break;
case P_FAULTED:
string = PS_FAULTED;
break;
case P_OFFLINE:
string = PS_OFFLINE;
break;
default:
string = "unknown";
break;
}
return (string);
}
/*
* Export this CPU's statistics (cpu_stat_t and cpu_stats_t) as raw and named
* kstats, respectively. This is done when a CPU is initialized or placed
* online via p_online(2).
*/
static void
{
char *module = "cpu";
char *class = "misc";
if (pool_pset_enabled())
else
/*
* Create named kstats
*/
zoneid); \
kstat_install(ksp); \
} else \
/*
* Export the familiar cpu_stat_t KSTAT_TYPE_RAW kstat.
*/
}
}
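/*
 * Illustrative sketch (not part of the original file): the general shape
 * of the named-kstat export performed by the (largely elided) creation
 * code above.  The helper name and its arguments are hypothetical.
 */
static kstat_t *
example_cpu_named_kstat(cpu_t *cp, char *name, void *data, uint_t ndata,
    int (*update)(kstat_t *, int))
{
	kstat_t *ksp;

	ksp = kstat_create("cpu", cp->cpu_id, name, "misc",
	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_data = data;		/* caller-provided template */
		ksp->ks_update = update;	/* refreshes ks_data on read */
		ksp->ks_private = cp;
		kstat_install(ksp);
	}
	return (ksp);
}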
static void
{
char ks_name[KSTAT_STRLEN];
}
static int
{
struct cpu_sys_stats_ks_data *csskd;
int i;
if (rw == KSTAT_WRITE)
return (EACCES);
/*
* Read CPU mstate, but compare with the last values we
* received to make sure that the returned kstats never
* decrease.
*/
sizeof (cpu_sys_stats_ks_data_template));
for (i = 0; i < PIL_MAX; i++)
for (i = 0; i < LOCK_LEVEL; i++)
return (0);
}
static int
{
struct cpu_vm_stats_ks_data *cvskd;
if (rw == KSTAT_WRITE)
return (EACCES);
sizeof (cpu_vm_stats_ks_data_template));
return (0);
}
static int
{
int i;
if (rw == KSTAT_WRITE)
return (EACCES);
/*
* Read CPU mstate, but compare with the last values we
* received to make sure that the returned kstats never
* decrease.
*/
for (i = 0; i < PIL_MAX; i++)
for (i = 0; i < LOCK_LEVEL; i++)
return (0);
}