common/disp/cmt.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between cpus to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup...but for root homed threads,
 * is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
 */
static cmt_lgrp_t   *cmt_lgrps = NULL;  /* cmt_lgrps list head */
static cmt_lgrp_t   *cpu0_lgrp = NULL;  /* boot CPU's initial lgrp */
                        /* used for null_proc_lpa */
cmt_lgrp_t      *cmt_root = NULL;   /* root lgroup's cmt_lgrp_t, */
                        /* created on first use by */
                        /* pg_cmt_cpu_init() */

/*
 * Cleared (and cpu0_lgrp recorded) by the first pg_cmt_cpu_init() call that
 * runs to completion.
 */
static int      is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 *
 * Indexed by hardware sharing relationship (pghw_type_t). An entry equal to 1
 * causes pg_cmt_cpu_init() to create the PG with CMT_NO_POLICY.
 */
static int      cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 *
 * Class registration and every CMT class callback in this file check this
 * flag on entry and return without doing any work when it is set.
 */
int         cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 *
 * pg_cmt_cpu_init() only goes on to link a CPU's lineage into the hierarchy
 * when validation reports CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; any other
 * status makes it return early.
 */
typedef enum cmt_lineage_validation {
    CMT_LINEAGE_VALID,
    CMT_LINEAGE_NON_CONCENTRIC,
    CMT_LINEAGE_PG_SPANS_LGRPS,
    CMT_LINEAGE_NON_PROMOTABLE,
    CMT_LINEAGE_REPAIRED,
    CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t    cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 *
 * A hardware sharing relationship is treated as "suspect" exactly when it is
 * a power management domain.
 */
#define PG_CMT_HW_SUSPECT(hw)   PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 *
 * The argument may be any pointer to a PG; it is cast to pg_t * and the id of
 * its class is compared against the id registered for the CMT class.
 */
#define IS_CMT_PG(pg)   (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/* Assigned by pg_cmt_class_init() when the "cmt" class is registered. */
static pg_cid_t     pg_cmt_class_id;        /* PG class id */

/*
 * Forward declarations for the file-local CMT class routines.
 * Every declaration is a full prototype so that calls are type checked.
 */
static pg_t     *pg_cmt_alloc(void);
static void     pg_cmt_free(pg_t *);
static void     pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void     pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void     pg_cmt_cpu_active(cpu_t *);
static void     pg_cmt_cpu_inactive(cpu_t *);
static void     pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void     pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char     *pg_cmt_policy_name(pg_t *);
static void     pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t     *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int      pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int      pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t   *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t   *pg_cmt_lgrp_create(lgrp_handle_t);
static void     cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
                kthread_t *, kthread_t *);
static void     cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
                kthread_t *, kthread_t *);
static void     cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
                cpu_pg_t *);

/*
 * CMT PG ops
 *
 * Class operations vector handed to pg_class_register() by
 * pg_cmt_class_init(). The initializer is positional, so the order of the
 * entries must follow the layout of struct pg_ops. The CMT class supplies no
 * cpupart_out handler.
 */
struct pg_ops pg_ops_cmt = {
    pg_cmt_alloc,
    pg_cmt_free,
    pg_cmt_cpu_init,
    pg_cmt_cpu_fini,
    pg_cmt_cpu_active,
    pg_cmt_cpu_inactive,
    pg_cmt_cpupart_in,
    NULL,           /* cpupart_out */
    pg_cmt_cpupart_move,
    pg_cmt_cpu_belongs,
    pg_cmt_policy_name,
};

/*
 * Register the "cmt" PG class and remember the class id that was handed out.
 * Nothing is registered when CMT scheduling has been disabled.
 */
void
pg_cmt_class_init(void)
{
    if (cmt_sched_disabled == 0) {
        pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt,
            PGR_PHYSICAL);
    }
}

/*
 * Announce that a CPU has come up, so that the thread it is running
 * (t0 or the slave startup thread) gets accounted for. A thread switch
 * event is posted for the CPU, stamped with the current unscaled hrtime,
 * naming the CPU's idle thread and the thread it is currently running.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
    hrtime_t    now = gethrtime_unscaled();

    pg_ev_thread_swtch(cp, now, cp->cpu_idle_thread, cp->cpu_thread);
}

/*
 * Tell whether a thread can move from CPU "from" to CPU "to" without a
 * performance penalty. That is taken to be the case when both CPUs carry
 * the same cached cache id in their cpu_physid.
 *
 * Returns 1 if so, 0 otherwise.
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
    return (from->cpu_physid->cpu_cacheid == to->cpu_physid->cpu_cacheid);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
    return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 *
 * Releases the pg_cmt_t behind "pg". The PG must be non-NULL and must belong
 * to the CMT class (both are asserted).
 */
static void
pg_cmt_free(pg_t *pg)
{
    pg_cmt_t    *cmt_pg = (pg_cmt_t *)pg;

    ASSERT(pg != NULL);
    ASSERT(IS_CMT_PG(pg));

    kmem_free(cmt_pg, sizeof (*cmt_pg));
}

/*
 * Map a hardware sharing relationship onto the set of dispatcher policies
 * that should be applied to it.
 *
 * The platform is consulted first; whatever it returns wins unless it is
 * CMT_NO_POLICY, in which case the built-in defaults below are used.
 * Relationships without a default yield CMT_NO_POLICY.
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
    pg_cmt_policy_t plat_policy = pg_plat_cmt_policy(hw);

    if (plat_policy != CMT_NO_POLICY)
        return (plat_policy);

    switch (hw) {
    case PGHW_CACHE:
        /* Shared caches get affinity in addition to balancing */
        return (CMT_AFFINITY | CMT_BALANCE);
    case PGHW_IPIPE:
    case PGHW_FPU:
    case PGHW_PROCNODE:
    case PGHW_CHIP:
    case PGHW_POW_ACTIVE:
    case PGHW_POW_IDLE:
        return (CMT_BALANCE);
    default:
        break;
    }

    return (CMT_NO_POLICY);
}

/*
 * Decide which of two PGs represents the sharing relationship that is more
 * important to optimize for, and return that PG.
 *
 * While CPU power management is disabled, a PG that is a power domain always
 * loses to one that is not. In every other case the platform's ranking of
 * the two hardware types decides; pg2 is returned unless the platform ranks
 * pg1's hardware type on top.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
    pghw_type_t type1 = ((pghw_t *)pg1)->pghw_hw;
    pghw_type_t type2 = ((pghw_t *)pg2)->pghw_hw;

    if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
        int pm1 = PGHW_IS_PM_DOMAIN(type1) ? 1 : 0;
        int pm2 = PGHW_IS_PM_DOMAIN(type2) ? 1 : 0;

        /* Exactly one is a power domain: the other one wins */
        if (pm1 != pm2)
            return (pm1 ? pg2 : pg1);
    }

    return ((pg_plat_hw_rank(type1, type2) == type1) ? pg1 : pg2);
}

/*
 * Install the CMT event callbacks on a PG.
 *
 * A PG with no CMT policy keeps whatever callbacks it already has. Active
 * power domains get the power-aware switch and remain handlers; every other
 * hardware type only has its switch handler replaced.
 */
static void
cmt_callback_init(pg_t *pg)
{
    pghw_type_t hw = ((pghw_t *)pg)->pghw_hw;

    if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
        return;

    if (hw == PGHW_POW_ACTIVE) {
        pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
        pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
    } else {
        pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
    }
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * The caller must hold cpu_lock. If PG has no parent there is nothing to do
 * and the routine returns immediately. Otherwise all CPUs are paused while
 * the hierarchy is rearranged and are restarted before returning.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
    pg_cmt_t    *parent;
    group_t     *children;
    cpu_t       *cpu;
    group_iter_t    iter;
    pg_cpu_itr_t    cpu_iter;
    int     r;
    int     err;
    int     nchildren;

    ASSERT(MUTEX_HELD(&cpu_lock));

    parent = pg->cmt_parent;
    if (parent == NULL) {
        /*
         * Nothing to do
         */
        return;
    }

    ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

    /*
     * We're changing around the hierarchy, which is actively traversed
     * by the dispatcher. Pause CPUS to ensure exclusivity.
     */
    pause_cpus(NULL, NULL);

    /*
     * If necessary, update the parent's sibling set, replacing parent
     * with PG. The replacement only happens if the parent was actually
     * found in (and removed from) that set.
     */
    if (parent->cmt_siblings) {
        if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * If the parent is at the top of the hierarchy, replace its entry
     * in the root lgroup's group of top level PGs. This is skipped when
     * the parent's sibling set already is the root's group, since that
     * case was handled just above.
     */
    if (parent->cmt_parent == NULL &&
        parent->cmt_siblings != &cmt_root->cl_pgs) {
        if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * We assume (and therefore assert) that the PG being promoted is an
     * only child of its parent. Update the parent's children set
     * replacing PG's entry with the parent (since the parent is becoming
     * the child). Then have PG and the parent swap children sets and
     * children counts.
     */
    ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
    if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
        r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
        ASSERT(r != -1);
    }

    children = pg->cmt_children;
    pg->cmt_children = parent->cmt_children;
    parent->cmt_children = children;

    nchildren = pg->cmt_nchildren;
    pg->cmt_nchildren = parent->cmt_nchildren;
    parent->cmt_nchildren = nchildren;

    /*
     * Update the sibling references for PG and its parent.
     * PG takes over the parent's old sibling set. The parent's siblings
     * become PG's children set, which (after the swap above) is the group
     * the parent was just placed into.
     */
    pg->cmt_siblings = parent->cmt_siblings;
    parent->cmt_siblings = pg->cmt_children;

    /*
     * Update any cached lineages in the per CPU pg data.
     */
    PG_CPU_ITR_INIT(pg, cpu_iter);
    while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
        int     idx;
        int     sz;
        pg_cmt_t    *cpu_pg;
        cpu_pg_t    *pgd;   /* CPU's PG data */

        /*
         * The CPU whose lineage is under construction still
         * references the bootstrap CPU PG data structure.
         */
        if (pg_cpu_is_bootstrapped(cpu))
            pgd = pgdata;
        else
            pgd = cpu->cpu_pg;

        /*
         * Iterate over the CPU's PGs updating the children
         * of the PG being promoted, since they have a new parent.
         */
        group_iter_init(&iter);
        while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
            if (cpu_pg->cmt_parent == pg) {
                cpu_pg->cmt_parent = parent;
            }
        }

        /*
         * Update the CMT load balancing lineage
         */
        if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
            /*
             * Unless this is the CPU whose lineage is being
             * constructed, the PG being promoted should be
             * in the lineage.
             */
            ASSERT(pg_cpu_is_bootstrapped(cpu));
            continue;
        }

        /*
         * cmt_pgs is ordered top-down, so the parent is expected to
         * sit immediately before PG.
         */
        ASSERT(idx > 0);
        ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

        /*
         * Have the child and the parent swap places in the CPU's
         * lineage
         */
        group_remove_at(&pgd->cmt_pgs, idx);
        group_remove_at(&pgd->cmt_pgs, idx - 1);
        err = group_add_at(&pgd->cmt_pgs, parent, idx);
        ASSERT(err == 0);
        err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
        ASSERT(err == 0);

        /*
         * Ensure cmt_lineage references CPU's leaf PG.
         * Since cmt_pgs is top-down ordered, the bottom is the last
         * element.
         */
        if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
            pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
    }

    /*
     * Update the parent references for PG and its parent.
     * This is deliberately left until after the per CPU pass above: that
     * pass re-parents every PG whose parent is PG, and must not see the
     * old parent as one of them.
     */
    pg->cmt_parent = parent->cmt_parent;
    parent->cmt_parent = pg;

    start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure while this routine runs.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 *
 * The caller must hold cpu_lock. Nothing is done when CMT scheduling is
 * disabled. If the CPU's lineage fails validation, the routine returns
 * after the PGs have been created/joined but before the lineage is linked
 * into the hierarchy.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
    pg_cmt_t    *pg;
    group_t     *cmt_pgs;
    int     levels, level;
    pghw_type_t hw;
    pg_t        *pg_cache = NULL;
    pg_cmt_t    *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
    lgrp_handle_t   lgrp_handle;
    cmt_lgrp_t  *lgrp;
    cmt_lineage_validation_t    lineage_status;

    ASSERT(MUTEX_HELD(&cpu_lock));
    ASSERT(pg_cpu_is_bootstrapped(cp));

    if (cmt_sched_disabled)
        return;

    /*
     * A new CPU is coming into the system.
     * Interrogate the platform to see if the CPU
     * has any performance or efficiency relevant
     * sharing relationships
     */
    cmt_pgs = &pgdata->cmt_pgs;
    pgdata->cmt_lineage = NULL;

    /*
     * cpu_cmt_hier collects the PGs that take part in load balancing /
     * coalescence for this CPU; "levels" counts them. The array is zeroed
     * so that unused slots read as NULL.
     */
    bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
    levels = 0;
    for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

        pg_cmt_policy_t policy;

        /*
         * We're only interested in the hw sharing relationships
         * for which we know how to optimize.
         */
        policy = pg_cmt_policy(hw);
        if (policy == CMT_NO_POLICY ||
            pg_plat_hw_shared(cp, hw) == 0)
            continue;

        /*
         * We will still create the PGs for hardware sharing
         * relationships that have been blacklisted, but won't
         * implement CMT thread placement optimizations against them.
         */
        if (cmt_hw_blacklisted[hw] == 1)
            policy = CMT_NO_POLICY;

        /*
         * Find (or create) the PG associated with
         * the hw sharing relationship in which cp
         * belongs.
         *
         * Determine if a suitable PG already
         * exists, or if one needs to be created.
         */
        pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
        if (pg == NULL) {
            /*
             * Create a new one.
             * Initialize the common...
             *
             * NOTE(review): the result of pg_create() is used
             * without a NULL check - confirm it cannot fail.
             */
            pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

            /* ... physical ... */
            pghw_init((pghw_t *)pg, cp, hw);

            /*
             * ... and CMT specific portions of the
             * structure.
             */
            pg->cmt_policy = policy;

            /* CMT event callbacks */
            cmt_callback_init((pg_t *)pg);

            bitset_init(&pg->cmt_cpus_actv_set);
            group_create(&pg->cmt_cpus_actv);
        } else {
            ASSERT(IS_CMT_PG(pg));
        }

        /* The PG's membership is changing; advance its generation */
        ((pghw_t *)pg)->pghw_generation++;

        /* Add the CPU to the PG */
        pg_cpu_add((pg_t *)pg, cp, pgdata);

        /*
         * Ensure capacity of the active CPU group/bitset
         */
        group_expand(&pg->cmt_cpus_actv,
            GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

        if (cp->cpu_seqid >=
            bitset_capacity(&pg->cmt_cpus_actv_set)) {
            bitset_resize(&pg->cmt_cpus_actv_set,
                cp->cpu_seqid + 1);
        }

        /*
         * Build a lineage of CMT PGs for load balancing / coalescence
         */
        if (policy & (CMT_BALANCE | CMT_COALESCE)) {
            cpu_cmt_hier[levels++] = pg;
        }

        /* Cache this for later */
        if (hw == PGHW_CACHE)
            pg_cache = (pg_t *)pg;
    }

    /* Make room in the CPU's lineage group for every level found */
    group_expand(cmt_pgs, levels);

    /* The root lgroup's cmt_lgrp_t is created on first use */
    if (cmt_root == NULL)
        cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

    /*
     * Find the lgrp that encapsulates this CPU's CMT hierarchy
     */
    lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
    if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
        lgrp = pg_cmt_lgrp_create(lgrp_handle);

    /*
     * Ascendingly sort the PGs in the lineage by number of CPUs
     */
    pg_cmt_hier_sort(cpu_cmt_hier, levels);

    /*
     * Examine the lineage and validate it.
     * This routine will also try to fix the lineage along with the
     * rest of the PG hierarchy should it detect an issue.
     *
     * If it returns anything other than VALID or REPAIRED, an
     * unrecoverable error has occurred, and we cannot proceed.
     */
    lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
    if ((lineage_status != CMT_LINEAGE_VALID) &&
        (lineage_status != CMT_LINEAGE_REPAIRED)) {
        /*
         * In the case of an unrecoverable error where CMT scheduling
         * has been disabled, assert that the under construction CPU's
         * PG data has an empty CMT load balancing lineage.
         */
        ASSERT((cmt_sched_disabled == 0) ||
            (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
        return;
    }

    /*
     * For existing PGs in the lineage, verify that the parent is
     * correct, as the generation in the lineage may have changed
     * as a result of the sorting. Start the traversal at the top
     * of the lineage, moving down.
     *
     * Whenever a promotion reorganizes the hierarchy, the traversal
     * restarts from the top.
     *
     * NOTE(review): for the top level, cpu_cmt_hier[level + 1] reads the
     * slot just past the lineage. That slot is expected to be NULL
     * (the array was zeroed above), which assumes levels is smaller than
     * PGHW_NUM_COMPONENTS - confirm.
     */
    for (level = levels - 1; level >= 0; ) {
        int reorg;

        reorg = 0;
        pg = cpu_cmt_hier[level];

        /*
         * Promote PGs at an incorrect generation into place.
         */
        while (pg->cmt_parent &&
            pg->cmt_parent != cpu_cmt_hier[level + 1]) {
            cmt_hier_promote(pg, pgdata);
            reorg++;
        }
        if (reorg > 0)
            level = levels - 1;
        else
            level--;
    }

    /*
     * For each of the PGs in the CPU's lineage:
     *  - Add an entry in the CPU sorted CMT PG group
     *    which is used for top down CMT load balancing
     *  - Tie the PG into the CMT hierarchy by connecting
     *    it to its parent and siblings.
     */
    for (level = 0; level < levels; level++) {
        uint_t      children;
        int     err;

        /*
         * cpu_cmt_hier is sorted smallest PG first, whereas cmt_pgs
         * is kept top-down, hence the reversed index.
         */
        pg = cpu_cmt_hier[level];
        err = group_add_at(cmt_pgs, pg, levels - level - 1);
        ASSERT(err == 0);

        /* The smallest PG is the leaf of the CPU's lineage */
        if (level == 0)
            pgdata->cmt_lineage = (pg_t *)pg;

        if (pg->cmt_siblings != NULL) {
            /* Already initialized */
            ASSERT(pg->cmt_parent == NULL ||
                pg->cmt_parent == cpu_cmt_hier[level + 1]);
            ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
                ((pg->cmt_parent != NULL) &&
                pg->cmt_siblings == pg->cmt_parent->cmt_children));
            continue;
        }

        if ((level + 1) == levels) {
            /*
             * Top of the lineage: no parent, and the siblings are
             * the lgroup's top level PGs. The root lgroup's count
             * is kept in step when this lgroup is not the root.
             */
            pg->cmt_parent = NULL;

            pg->cmt_siblings = &lgrp->cl_pgs;
            children = ++lgrp->cl_npgs;
            if (cmt_root != lgrp)
                cmt_root->cl_npgs++;
        } else {
            pg->cmt_parent = cpu_cmt_hier[level + 1];

            /*
             * A good parent keeps track of their children.
             * The parent's children group is also the PG's
             * siblings.
             */
            if (pg->cmt_parent->cmt_children == NULL) {
                pg->cmt_parent->cmt_children =
                    kmem_zalloc(sizeof (group_t), KM_SLEEP);
                group_create(pg->cmt_parent->cmt_children);
            }
            pg->cmt_siblings = pg->cmt_parent->cmt_children;
            children = ++pg->cmt_parent->cmt_nchildren;
        }

        /*
         * Only capacity is reserved here; the PG itself is added to
         * its sibling group(s) without resizing when its first CPU
         * becomes active (see pg_cmt_cpu_active()).
         */
        group_expand(pg->cmt_siblings, children);
        group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
    }

    /*
     * Cache the chip and core IDs in the cpu_t->cpu_physid structure
     * for fast lookups later.
     */
    if (cp->cpu_physid) {
        cp->cpu_physid->cpu_chipid =
            pg_plat_hw_instance_id(cp, PGHW_CHIP);
        cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

        /*
         * If this cpu has a PG representing shared cache, then set
         * cpu_cacheid to that PG's logical id
         */
        if (pg_cache)
            cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
    }

    /* CPU0 only initialization */
    if (is_cpu0) {
        is_cpu0 = 0;
        cpu0_lgrp = lgrp;
    }

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 *
 * Nothing is done when CMT scheduling is disabled.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
    group_iter_t    i;
    pg_cmt_t    *pg;
    group_t     *pgs, *cmt_pgs;
    lgrp_handle_t   lgrp_handle;
    cmt_lgrp_t  *lgrp;

    if (cmt_sched_disabled)
        return;

    ASSERT(pg_cpu_is_bootstrapped(cp));

    pgs = &pgdata->pgs;
    cmt_pgs = &pgdata->cmt_pgs;

    /*
     * Find the lgroup that encapsulates this CPU's CMT hierarchy
     */
    lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

    lgrp = pg_cmt_find_lgrp(lgrp_handle);
    if (ncpus == 1 && lgrp != cpu0_lgrp) {
        /*
         * One might wonder how we could be deconfiguring the
         * only CPU in the system.
         *
         * On Starcat systems when null_proc_lpa is detected,
         * the boot CPU (which is already configured into a leaf
         * lgroup), is moved into the root lgroup. This is done by
         * deconfiguring it from both lgroups and processor
         * groups), and then later reconfiguring it back in.  This
         * call to pg_cmt_cpu_fini() is part of that deconfiguration.
         *
         * This special case is detected by noting that the platform
         * has changed the CPU's lgrp affiliation (since it now
         * belongs in the root). In this case, use the cmt_lgrp_t
         * cached for the boot CPU, since this is what needs to be
         * torn down.
         */
        lgrp = cpu0_lgrp;
    }

    ASSERT(lgrp != NULL);

    /*
     * First, clean up anything load balancing specific for each of
     * the CPU's PGs that participated in CMT load balancing.
     * The walk starts at the CPU's leaf PG and follows the parent links
     * up to the top of the lineage.
     */
    pg = (pg_cmt_t *)pgdata->cmt_lineage;
    while (pg != NULL) {

        ((pghw_t *)pg)->pghw_generation++;

        /*
         * Remove the PG from the CPU's load balancing lineage
         */
        (void) group_remove(cmt_pgs, pg, GRP_RESIZE);

        /*
         * If it's about to become empty, destroy its children
         * group, and remove its reference from its siblings.
         * This is done here (rather than below) to avoid removing
         * our reference from a PG that we just eliminated.
         *
         * A PG with exactly one CPU left is about to become empty,
         * because that CPU is removed from it further down.
         *
         * NOTE(review): the children group_t is allocated with
         * kmem_zalloc() in pg_cmt_cpu_init(), but no matching
         * kmem_free() is visible here - confirm whether
         * group_destroy() releases the group_t itself.
         */
        if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
            if (pg->cmt_children != NULL)
                group_destroy(pg->cmt_children);
            if (pg->cmt_siblings != NULL) {
                if (pg->cmt_siblings == &lgrp->cl_pgs)
                    lgrp->cl_npgs--;
                else
                    pg->cmt_parent->cmt_nchildren--;
            }
        }
        pg = pg->cmt_parent;
    }
    ASSERT(GROUP_SIZE(cmt_pgs) == 0);

    /*
     * Now that the load balancing lineage updates have happened,
     * remove the CPU from all its PGs (destroying any that become
     * empty).
     */
    group_iter_init(&i);
    while ((pg = group_iterate(pgs, &i)) != NULL) {
        if (IS_CMT_PG(pg) == 0)
            continue;

        pg_cpu_delete((pg_t *)pg, cp, pgdata);
        /*
         * Deleting the CPU from the PG changes the CPU's
         * PG group over which we are actively iterating
         * Re-initialize the iteration
         */
        group_iter_init(&i);

        if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

            /*
             * The PG has become zero sized, so destroy it.
             * The CMT specific pieces are torn down first, then
             * the physical (pghw) layer, then the PG itself.
             */
            group_destroy(&pg->cmt_cpus_actv);
            bitset_fini(&pg->cmt_cpus_actv_set);
            pghw_fini((pghw_t *)pg);

            pg_destroy((pg_t *)pg);
        }
    }
}

/*
 * Class callback when a CPU is entering a cpu partition
 *
 * Grows the partition's CMT PG bitset, where needed, so that it can hold the
 * id of every CMT PG that cp is a member of. No bits are set here. The
 * caller must hold cpu_lock.
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
    group_t     *cpu_pgs;
    pg_t        *cur;
    group_iter_t    iter;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    cpu_pgs = &cp->cpu_pg->pgs;

    for (group_iter_init(&iter);
        (cur = group_iterate(cpu_pgs, &iter)) != NULL; ) {
        /* Only CMT PGs whose id doesn't fit yet need attention */
        if (IS_CMT_PG(cur) &&
            bitset_capacity(&pp->cp_cmt_pgs) <= cur->pg_id)
            bitset_resize(&pp->cp_cmt_pgs, cur->pg_id + 1);
    }
}

/*
 * Class callback when a CPU is actually moving partitions
 *
 * Every CMT PG of cp is marked in the new partition's PG bitset. A PG is
 * dropped from the bitset of cp's current partition when no other active CPU
 * of that PG lives in the old partition. The caller must hold cpu_lock.
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
    group_t     *cpu_pgs;
    pg_t        *cur;
    group_iter_t    pgi;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    cpu_pgs = &cp->cpu_pg->pgs;

    for (group_iter_init(&pgi);
        (cur = group_iterate(cpu_pgs, &pgi)) != NULL; ) {
        pg_cpu_itr_t    members;
        cpu_t       *other;
        boolean_t   still_used = B_FALSE;

        if (!IS_CMT_PG(cur))
            continue;

        /* The PG is now represented in the new partition */
        bitset_add(&newpp->cp_cmt_pgs, cur->pg_id);

        /*
         * Look for another active CPU of this PG that is still
         * in the old partition; stop at the first one found.
         */
        PG_CPU_ITR_INIT(cur, members);
        while (!still_used &&
            (other = pg_cpu_next(&members)) != NULL) {
            if (other != cp && CPU_ACTIVE(other) &&
                other->cpu_part->cp_id == oldpp->cp_id)
                still_used = B_TRUE;
        }

        /* cp was the last one, so retire the PG there */
        if (!still_used)
            bitset_del(&cp->cpu_part->cp_cmt_pgs, cur->pg_id);
    }
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 *
 * For each CMT PG of cp: the CPU joins the PG's active CPU group and
 * bitset, and the PG is marked active in cp's partition. The caller must
 * hold cpu_lock.
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
    group_t     *cpu_pgs;
    group_iter_t    iter;
    pg_cmt_t    *cmt;
    int     rc;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    cpu_pgs = &cp->cpu_pg->pgs;

    for (group_iter_init(&iter);
        (cmt = group_iterate(cpu_pgs, &iter)) != NULL; ) {

        if (!IS_CMT_PG(cmt))
            continue;

        /* Topology is changing, so step to the next generation */
        ((pghw_t *)cmt)->pghw_generation++;

        rc = group_add(&cmt->cmt_cpus_actv, cp, GRP_NORESIZE);
        ASSERT(rc == 0);

        /*
         * The first active CPU of a PG that takes part in load
         * balancing / coalescence makes the PG a balancing
         * candidate among its siblings.
         */
        if (GROUP_SIZE(&cmt->cmt_cpus_actv) == 1 &&
            (cmt->cmt_policy & (CMT_BALANCE | CMT_COALESCE)) != 0) {
            rc = group_add(cmt->cmt_siblings, cmt, GRP_NORESIZE);
            ASSERT(rc == 0);

            /*
             * Top level PGs are candidates within the root
             * lgroup as well, unless that is where they were
             * just added.
             */
            if (cmt->cmt_parent == NULL &&
                cmt->cmt_siblings != &cmt_root->cl_pgs) {
                rc = group_add(&cmt_root->cl_pgs, cmt,
                    GRP_NORESIZE);
                ASSERT(rc == 0);
            }
        }

        /*
         * Record the CPU in the PG's active CPU bitset, and the PG
         * in the bitset of the CPU's partition.
         */
        bitset_add(&cmt->cmt_cpus_actv_set, cp->cpu_seqid);
        bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)cmt)->pg_id);
    }
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 *
 * For each CMT PG of cp: the CPU leaves the PG's active CPU group and
 * bitset, and the PG is retired from cp's partition when no other active
 * CPU of the PG remains there. The caller must hold cpu_lock.
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
    group_t     *cpu_pgs;
    group_iter_t    iter;
    pg_cmt_t    *cmt;
    int     rc;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    cpu_pgs = &cp->cpu_pg->pgs;

    for (group_iter_init(&iter);
        (cmt = group_iterate(cpu_pgs, &iter)) != NULL; ) {
        pg_cpu_itr_t    members;
        cpu_t       *other;
        boolean_t   part_in_use = B_FALSE;

        if (!IS_CMT_PG(cmt))
            continue;

        /* Topology is changing, so step to the next generation */
        ((pghw_t *)cmt)->pghw_generation++;

        /* Take the CPU out of the PG's active group and bitset */
        rc = group_remove(&cmt->cmt_cpus_actv, cp, GRP_NORESIZE);
        ASSERT(rc == 0);

        bitset_del(&cmt->cmt_cpus_actv_set, cp->cpu_seqid);

        /*
         * Once the last active CPU is gone, a PG over which load
         * was balanced stops being a balancing candidate.
         */
        if (GROUP_SIZE(&cmt->cmt_cpus_actv) == 0 &&
            (cmt->cmt_policy & (CMT_BALANCE | CMT_COALESCE)) != 0) {
            rc = group_remove(cmt->cmt_siblings, cmt, GRP_NORESIZE);
            ASSERT(rc == 0);

            if (cmt->cmt_parent == NULL &&
                cmt->cmt_siblings != &cmt_root->cl_pgs) {
                rc = group_remove(&cmt_root->cl_pgs, cmt,
                    GRP_NORESIZE);
                ASSERT(rc == 0);
            }
        }

        /*
         * There can never be more active CPUs than CPUs in the PG
         */
        ASSERT(GROUP_SIZE(&cmt->cmt_cpus_actv) <=
            GROUP_SIZE(&((pg_t *)cmt)->pg_cpus));

        /*
         * Look for another active CPU of this PG in cp's partition;
         * stop at the first one found.
         */
        PG_CPU_ITR_INIT(cmt, members);
        while (!part_in_use &&
            (other = pg_cpu_next(&members)) != NULL) {
            if (other != cp && CPU_ACTIVE(other) &&
                other->cpu_part->cp_id == cp->cpu_part->cp_id)
                part_in_use = B_TRUE;
        }

        /* None left: drop the PG from the partition's bitset */
        if (!part_in_use) {
            bitset_del(&cp->cpu_part->cp_cmt_pgs,
                ((pg_t *)cmt)->pg_id);
        }
    }
}

/*
 * Return non-zero if the CPU belongs in the given PG
 *
 * Membership is decided by asking the platform whether cp shares the PG's
 * hardware relationship with the first CPU already in the PG. The PG must
 * therefore contain at least one CPU (asserted).
 *
 * Returns 1 if cp belongs, 0 otherwise.
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
    pghw_type_t hw = ((pghw_t *)pg)->pghw_hw;
    cpu_t       *member = GROUP_ACCESS(&pg->pg_cpus, 0);

    ASSERT(member != NULL);

    return (pg_plat_cpus_share(cp, member, hw) ? 1 : 0);
}

/*
 * Sort a CPU's CMT hierarchy in place, where "size" is the number of
 * levels (entries in "hier").
 *
 * The array is ordered by ascending CPU count. Entries with equal CPU
 * counts are then ordered according to pg_cmt_hier_rank().
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
    pg_t        **pgs = (pg_t **)hier;
    pg_t        *key;
    int     gap, pos, slot;
    int     lo, hi, ncpus;

    /*
     * Pass 1: shell sort by number of CPUs. The gap shrinks by a
     * factor of 5/11 each round, with a gap of 2 forced down to 1 so
     * that the final round is always a plain insertion sort.
     */
    for (gap = size / 2; gap > 0;
        gap = (gap == 2) ? 1 : (gap * 5) / 11) {
        for (pos = gap; pos < size; pos++) {
            key = pgs[pos];
            for (slot = pos; (slot >= gap) &&
                (PG_NUM_CPUS(pgs[slot - gap]) > PG_NUM_CPUS(key));
                slot -= gap) {
                pgs[slot] = pgs[slot - gap];
            }
            pgs[slot] = key;
        }
    }

    /*
     * Pass 2: break ties. For every starting index, locate the run
     * of entries sharing its CPU count and insertion sort that run,
     * moving an entry ahead of each predecessor that
     * pg_cmt_hier_rank() selects over it.
     */
    for (lo = 0; lo < size; lo++) {
        ncpus = PG_NUM_CPUS(pgs[lo]);

        /* [lo, hi) is the run with the same number of CPUs */
        for (hi = lo; (hi < size) &&
            (ncpus == PG_NUM_CPUS(pgs[hi])); hi++)
            continue;

        for (pos = lo + 1; pos < hi; pos++) {
            key = pgs[pos];
            for (slot = pos - 1; slot >= lo &&
                pg_cmt_hier_rank(hier[slot],
                (pg_cmt_t *)key) == hier[slot]; slot--) {
                pgs[slot + 1] = pgs[slot];
            }
            pgs[slot + 1] = key;
        }
    }
}

/*
 * Look up the cmt_lgrp_t for an lgroup handle.
 *
 * Walks the cmt_lgrps list and returns the first entry whose handle
 * matches "hand", or NULL if there is none. The caller must hold cpu_lock.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
    cmt_lgrp_t  *cur;

    ASSERT(MUTEX_HELD(&cpu_lock));

    for (cur = cmt_lgrps; cur != NULL; cur = cur->cl_next) {
        if (cur->cl_hand == hand)
            return (cur);
    }

    return (NULL);
}

/*
 * Allocate a cmt_lgrp_t for the given lgroup handle, link it at the head
 * of the cmt_lgrps list, and create its (empty) group of PGs.
 *
 * The allocation uses KM_SLEEP. The caller must hold cpu_lock.
 * Returns the new cmt_lgrp_t.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
    cmt_lgrp_t  *new_lgrp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    new_lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
    new_lgrp->cl_hand = hand;
    new_lgrp->cl_npgs = 0;

    /* Push the new entry onto the front of the list */
    new_lgrp->cl_next = cmt_lgrps;
    cmt_lgrps = new_lgrp;

    group_create(&new_lgrp->cl_pgs);

    return (new_lgrp);
}

/*
 * Power aware dispatching (PAD) control
 *
 * cmt_pad_enable() and cmt_pad_disable() switch the dispatcher policy
 * implemented across the power domain PGs of the given type.
 * The caller must be holding cpu_lock.
 *
 * Both return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
    group_t     *domains;
    group_iter_t    it;
    pg_cmt_t    *dom;
    pg_cmt_t    *parent;

    ASSERT(PGHW_IS_PM_DOMAIN(type));
    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled == 1)
        return (-1);

    /*
     * Fail if there are no instances of this type of power domain,
     * or if power domains of this type have been blacklisted.
     */
    domains = pghw_set_lookup(type);
    if (domains == NULL)
        return (-1);
    if (cmt_hw_blacklisted[type])
        return (-1);

    /*
     * Give every power domain its default power/performance policy.
     *
     * The dispatcher walks the PG hierarchy from the top down, so the
     * policy of a higher PG dominates. A power domain that is the
     * only child of its parent would therefore never see its policy
     * applied by just setting it. While the parent implements a
     * different policy and spans the same number of CPUs, promote the
     * power domain above the parent.
     */
    group_iter_init(&it);
    while ((dom = group_iterate(domains, &it)) != NULL) {
        dom->cmt_policy = pg_cmt_policy(((pghw_t *)dom)->pghw_hw);

        for (;;) {
            parent = dom->cmt_parent;
            if (parent == NULL)
                break;
            if (parent->cmt_policy == dom->cmt_policy)
                break;
            if (PG_NUM_CPUS((pg_t *)dom) !=
                PG_NUM_CPUS((pg_t *)parent))
                break;

            cmt_hier_promote(dom, NULL);
        }
    }

    return (0);
}

/*
 * Disable power aware dispatching for power domains of the given type by
 * switching each of them to the load balancing policy.
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_disable(pghw_type_t type)
{
    group_t     *domains;
    group_iter_t    it;
    pg_cmt_t    *dom;
    pg_cmt_t    *only;

    ASSERT(PGHW_IS_PM_DOMAIN(type));
    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled == 1)
        return (-1);

    /*
     * Fail if there are no instances of this type of power domain.
     */
    domains = pghw_set_lookup(type);
    if (domains == NULL)
        return (-1);

    group_iter_init(&it);
    while ((dom = group_iterate(domains, &it)) != NULL) {
        /*
         * A sole child whose policy does not include load
         * balancing is promoted above the power domain so that
         * the child's policy dominates.
         */
        if (dom->cmt_children != NULL &&
            GROUP_SIZE(dom->cmt_children) == 1) {
            only = GROUP_ACCESS(dom->cmt_children, 0);
            if (!(only->cmt_policy & CMT_BALANCE))
                cmt_hier_promote(only, NULL);
        }

        /* Performance optimization (load balancing) from now on */
        dom->cmt_policy = CMT_BALANCE;
    }

    return (0);
}

/*
 * Thread switch callback: maintain the PG's utilization count.
 *
 * The count is atomically incremented when the CPU switches away from its
 * idle thread, and atomically decremented when it switches to its idle
 * thread. Any other switch leaves the count unchanged.
 */
/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
            kthread_t *new)
{
    pg_cmt_t    *cmt = (pg_cmt_t *)pg;

    if (old == cp->cpu_idle_thread) {
        /* CPU is leaving idle */
        atomic_inc_32(&cmt->cmt_utilization);
        return;
    }

    if (new == cp->cpu_idle_thread) {
        /* CPU is going idle */
        atomic_dec_32(&cmt->cmt_utilization);
    }
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 *
 * True when thread "t" is in state TS_RUN, its dispatch queue has a CPU
 * associated with it (disp_cpu is non-NULL), and that CPU's cpu_seqid is
 * in the active CPU bitset (cmt_cpus_actv_set) of the pg_cmt_t "pg".
 *
 * Note that "t" is evaluated more than once.
 */
#define THREAD_RUNNABLE_IN_PG(t, pg)                    \
    ((t)->t_state == TS_RUN &&                  \
        (t)->t_disp_queue->disp_cpu &&              \
        bitset_in_set(&(pg)->cmt_cpus_actv_set,         \
        (t)->t_disp_queue->disp_cpu->cpu_seqid))

/*
 * Thread switch callback for power domain PGs.
 *
 * Maintains the PG's utilization count in the same way as
 * cmt_ev_thread_swtch(), and additionally tells the CPU power manager when
 * the count rises to 1 (busy from idle) or drops to 0 (idle from busy).
 */
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
    pg_cmt_t    *cmt = (pg_cmt_t *)pg;
    cpupm_domain_t  *dom;

    if (old == cp->cpu_idle_thread) {
        ASSERT(new != cp->cpu_idle_thread);

        /*
         * Only the transition of the count to 1 means the domain
         * went from idle to non-idle.
         */
        if (atomic_inc_32_nv(&cmt->cmt_utilization) != 1)
            return;

        dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
        cpupm_utilization_event(cp, now, dom,
            CPUPM_DOM_BUSY_FROM_IDLE);
        return;
    }

    if (new != cp->cpu_idle_thread)
        return;

    ASSERT(old != cp->cpu_idle_thread);

    /* The domain is idle only once the count reaches zero */
    if (atomic_dec_32_nv(&cmt->cmt_utilization) != 0)
        return;

    /*
     * Stay quiet if the outgoing thread is simply migrating between
     * CPUs in the domain.
     */
    if (THREAD_RUNNABLE_IN_PG(old, cmt))
        return;

    dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
    cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);
}

/*
 * Thread remain callback for power domain PGs: report to the CPU power
 * manager that the domain remains busy. The event is sent with a
 * timestamp of zero.
 */
/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
    cpupm_domain_t  *dom;

    dom = (cpupm_domain_t *)((pg_cmt_t *)pg)->cmt_pg.pghw_handle;

    cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return a printable name for the CMT scheduling policy being
 * implemented across this PG.
 *
 * The name reflects the CMT_BALANCE, CMT_COALESCE and CMT_AFFINITY policy
 * bits. If both CMT_BALANCE and CMT_COALESCE are set, balancing is
 * reported.
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
    pg_cmt_policy_t policy = ((pg_cmt_t *)pg)->cmt_policy;
    int     affinity = ((policy & CMT_AFFINITY) != 0);

    if (policy & CMT_BALANCE) {
        return (affinity ?
            "Load Balancing & Affinity" : "Load Balancing");
    }

    if (policy & CMT_COALESCE) {
        return (affinity ?
            "Load Coalescence & Affinity" : "Load Coalescence");
    }

    return (affinity ? "Affinity" : "None");
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * Arguments:
 *   pg_bad  - the offending PG; its hardware type selects the set of PGs
 *             that get pruned.
 *   lineage - array of PGs; pg_bad is removed from it (if present) by
 *             shifting the later entries down.
 *   sz      - in/out number of entries in "lineage"; decremented if pg_bad
 *             was found in the array.
 *   pgdata  - PG data used in place of cpu->cpu_pg for any CPU for which
 *             pg_cpu_is_bootstrapped() is true.
 *
 * The caller must hold cpu_lock. The hardware type is marked in
 * cmt_hw_blacklisted[], and CPUs are paused while the hierarchy is modified.
 *
 * Always returns 0.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
    group_t     *hwset, *children;
    int     i, j, r, size = *sz;
    group_iter_t    hw_iter, child_iter;
    pg_cpu_itr_t    cpu_iter;
    pg_cmt_t    *pg, *child;
    cpu_t       *cpu;
    int     cap_needed;
    pghw_type_t hw;

    ASSERT(MUTEX_HELD(&cpu_lock));

    /*
     * Inform pghw layer that this PG is pruned.
     */
    pghw_cmt_fini((pghw_t *)pg_bad);

    hw = ((pghw_t *)pg_bad)->pghw_hw;

    /*
     * Log a note when the type being pruned is one of the CPUPM power
     * domain types.
     */
    if (hw == PGHW_POW_ACTIVE) {
        cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
            "Event Based CPUPM Unavailable");
    } else if (hw == PGHW_POW_IDLE) {
        cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
            "Dispatcher assisted CPUPM disabled.");
    }

    /*
     * Find and eliminate the PG from the lineage, closing the gap and
     * reporting the reduced size back through "sz".
     */
    for (i = 0; i < size; i++) {
        if (lineage[i] == pg_bad) {
            for (j = i; j < size - 1; j++)
                lineage[j] = lineage[j + 1];
            *sz = size - 1;
            break;
        }
    }

    /*
     * We'll prune all instances of the hardware sharing relationship
     * represented by pg. But before we do that (and pause CPUs) we need
     * to ensure the hierarchy's groups are properly sized.
     */
    hwset = pghw_set_lookup(hw);

    /*
     * Blacklist the hardware so future processor groups of this type won't
     * participate in CMT thread placement.
     *
     * XXX
     * For heterogeneous system configurations, this might be overkill.
     * We may only need to blacklist the illegal PGs, and other instances
     * of this hardware sharing relationship may be ok.
     */
    cmt_hw_blacklisted[hw] = 1;

    /*
     * For each of the PGs being pruned, ensure sufficient capacity in
     * the siblings set for the PG's children. This is done up front so
     * that the additions made below, while CPUs are paused, can use
     * GRP_NORESIZE.
     */
    group_iter_init(&hw_iter);
    while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
        /*
         * PG is being pruned, but if it is bringing up more than
         * one child, ask for more capacity in the siblings group.
         * (The PG's own slot is freed, so only children - 1 extra
         * slots are needed.)
         */
        cap_needed = 0;
        if (pg->cmt_children &&
            GROUP_SIZE(pg->cmt_children) > 1) {
            cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

            group_expand(pg->cmt_siblings,
                GROUP_SIZE(pg->cmt_siblings) + cap_needed);

            /*
             * If this is a top level group, also ensure the
             * capacity in the root lgrp level CMT grouping.
             */
            if (pg->cmt_parent == NULL &&
                pg->cmt_siblings != &cmt_root->cl_pgs) {
                group_expand(&cmt_root->cl_pgs,
                    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
                cmt_root->cl_npgs += cap_needed;
            }
        }
    }

    /*
     * We're operating on the PG hierarchy. Pause CPUs to ensure
     * exclusivity with respect to the dispatcher.
     */
    pause_cpus(NULL, NULL);

    /*
     * Prune all PG instances of the hardware sharing relationship
     * represented by pg.
     */
    group_iter_init(&hw_iter);
    while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

        /*
         * Remove PG from its group of siblings, if it's there.
         * The return values are deliberately ignored.
         */
        if (pg->cmt_siblings) {
            (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
        }
        if (pg->cmt_parent == NULL &&
            pg->cmt_siblings != &cmt_root->cl_pgs) {
            (void) group_remove(&cmt_root->cl_pgs, pg,
                GRP_NORESIZE);
        }

        /*
         * Indicate that no CMT policy will be implemented across
         * this PG.
         */
        pg->cmt_policy = CMT_NO_POLICY;

        /*
         * Move PG's children from its children set to its parent's
         * children set. Note that the parent's children set, and PG's
         * siblings set are the same thing.
         *
         * Because we are iterating over the same group that we are
         * operating on (removing the children), first add all of PG's
         * children to the parent's children set, and once we are done
         * iterating, empty PG's children set.
         */
        if (pg->cmt_children != NULL) {
            children = pg->cmt_children;

            group_iter_init(&child_iter);
            while ((child = group_iterate(children, &child_iter))
                != NULL) {
                if (pg->cmt_siblings != NULL) {
                    r = group_add(pg->cmt_siblings, child,
                        GRP_NORESIZE);
                    ASSERT(r == 0);

                    /*
                     * Children of a top level PG are
                     * also added to the root lgrp level
                     * CMT grouping.
                     */
                    if (pg->cmt_parent == NULL &&
                        pg->cmt_siblings !=
                        &cmt_root->cl_pgs) {
                        r = group_add(&cmt_root->cl_pgs,
                            child, GRP_NORESIZE);
                        ASSERT(r == 0);
                    }
                }
            }
            group_empty(pg->cmt_children);
        }

        /*
         * Reset the callbacks to the defaults
         */
        pg_callback_set_defaults((pg_t *)pg);

        /*
         * Update all the CPU lineages in each of PG's CPUs
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
            pg_cmt_t    *cpu_pg;
            group_iter_t    liter;  /* Iterator for the lineage */
            cpu_pg_t    *cpd;   /* CPU's PG data */

            /*
             * A CPU whose lineage is still under construction
             * references the bootstrap CPU PG data structure,
             * so use the caller supplied "pgdata" for it instead
             * of cpu->cpu_pg.
             */
            if (pg_cpu_is_bootstrapped(cpu))
                cpd = pgdata;
            else
                cpd = cpu->cpu_pg;

            /*
             * Iterate over the CPU's PGs updating the children
             * of the PG being pruned, since they have a new
             * parent and siblings set.
             */
            group_iter_init(&liter);
            while ((cpu_pg = group_iterate(&cpd->pgs,
                &liter)) != NULL) {
                if (cpu_pg->cmt_parent == pg) {
                    cpu_pg->cmt_parent = pg->cmt_parent;
                    cpu_pg->cmt_siblings = pg->cmt_siblings;
                }
            }

            /*
             * Update the CPU's lineages
             *
             * Remove the PG from the CPU's group used for CMT
             * scheduling.
             */
            (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
        }
    }
    start_cpus();
    return (0);
}

/*
 * Disable CMT scheduling
 *
 * With CPUs paused, empty the CMT scheduling group (cmt_pgs) of every CPU
 * on cpu_list that has PG data, and set cmt_sched_disabled. A note is
 * logged once the CPUs have been restarted. The caller must hold cpu_lock.
 */
static void
pg_cmt_disable(void)
{
    cpu_t       *cp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    pause_cpus(NULL, NULL);

    /* cpu_list is circular; stop once we are back at the head */
    cp = cpu_list;
    do {
        if (cp->cpu_pg != NULL)
            group_empty(&cp->cpu_pg->cmt_pgs);
        cp = cp->cpu_next;
    } while (cp != cpu_list);

    cmt_sched_disabled = 1;

    start_cpus();

    cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the CMT
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
    int     i, j, size;
    pg_cmt_t    *pg, *pg_next, *pg_bad, *pg_tmp, *parent;
    cpu_t       *cp;
    pg_cpu_itr_t    cpu_iter;
    lgrp_handle_t   lgrp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    /*
     * Each validation pass starts here. The size is re-read from "sz" on
     * every pass since pg_cmt_prune() may have removed a PG from the
     * lineage. Note that "lgrp" is reset only once per pass (not once per
     * PG), so the lgroup check below compares every CPU of every PG in
     * the lineage against the first CPU examined.
     */
revalidate:
    size = *sz;
    pg_bad = NULL;
    lgrp = LGRP_NULL_HANDLE;
    for (i = 0; i < size; i++) {

        pg = lineage[i];
        if (i < size - 1)
            pg_next = lineage[i + 1];
        else
            pg_next = NULL;

        /*
         * We assume that the lineage has already been sorted
         * by the number of CPUs. In fact, we depend on it.
         */
        ASSERT(pg_next == NULL ||
            (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

        /*
         * The CPUs PG lineage was passed as the first argument to
         * this routine and contains the sorted list of the CPU's
         * PGs. Ultimately, the ordering of the PGs in that list, and
         * the ordering as traversed by the cmt_parent list must be
         * the same. PG promotion will be used as the mechanism to
         * achieve this, but first we need to look for cases where
         * promotion will be necessary, and validate that will be
         * possible without violating the subset invariant described
         * above.
         *
         * Since the PG topology is in the middle of being changed, we
         * need to check whether the PG's existing parent (if any) is
         * part of this CPU's lineage (and therefore should contain
         * the new CPU). If not, it means that the addition of the
         * new CPU should have made this PG have more CPUs than its
         * parent (and other ancestors not in the same lineage) and
         * will need to be promoted into place.
         *
         * We need to verify all of this to defend against a buggy
         * BIOS giving bad power domain CPU groupings. Sigh.
         */
        parent = pg->cmt_parent;
        while (parent != NULL) {
            /*
             * Determine if the parent/ancestor is in this lineage.
             * The scan stops as soon as pg_tmp matches, so pg_tmp
             * equals parent afterwards only if it was found.
             */
            pg_tmp = NULL;
            for (j = 0; (j < size) && (pg_tmp != parent); j++) {
                pg_tmp = lineage[j];
            }
            if (pg_tmp == parent) {
                /*
                 * It's in the lineage. The concentricity
                 * checks will handle the rest.
                 */
                break;
            }
            /*
             * If it is not in the lineage, PG will eventually
             * need to be promoted above it. Verify the ancestor
             * is a proper subset. There is still an error if
             * the ancestor has the same number of CPUs as PG,
             * since that would imply it should be in the lineage,
             * and we already know it isn't.
             */
            if (PG_NUM_CPUS((pg_t *)parent) >=
                PG_NUM_CPUS((pg_t *)pg)) {
                /*
                 * Not a proper subset if the parent/ancestor
                 * has the same or more CPUs than PG.
                 */
                cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
                goto handle_error;
            }
            parent = parent->cmt_parent;
        }

        /*
         * Walk each of the CPUs in the PGs group and perform
         * consistency checks along the way.
         */
        PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
        while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
            /*
             * Verify that there aren't any CPUs contained in PG
             * that the next PG in the lineage (which is larger
             * or same size) doesn't also contain.
             */
            if (pg_next != NULL &&
                pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
                cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
                goto handle_error;
            }

            /*
             * Verify that all the CPUs in the PG are in the same
             * lgroup. The first CPU seen in this pass establishes
             * the expected handle.
             */
            if (lgrp == LGRP_NULL_HANDLE) {
                lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
            } else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
                cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
                goto handle_error;
            }
        }
    }

    /*
     * This label is reached both by the error paths above and by falling
     * out of the loop when no problem was found. In the latter case
     * cmt_lineage_status still holds whatever it held before the pass.
     *
     * NOTE(review): this routine never assigns CMT_LINEAGE_VALID itself;
     * presumably cmt_lineage_status starts out with that value - confirm
     * against its definition.
     */
handle_error:
    /*
     * Some of these validation errors can result when the CPU grouping
     * information is derived from buggy sources (for example, incorrect
     * ACPI tables on x86 systems).
     *
     * We'll try to recover in such cases by pruning out the illegal
     * groupings from the PG hierarchy, which means that we won't optimize
     * for those levels, but we will for the remaining ones.
     */
    switch (cmt_lineage_status) {
    case CMT_LINEAGE_VALID:
    case CMT_LINEAGE_REPAIRED:
        break;
    case CMT_LINEAGE_PG_SPANS_LGRPS:
        /*
         * We've detected a PG whose CPUs span lgroups.
         *
         * This isn't supported, as the dispatcher isn't allowed to
         * do CMT thread placement across lgroups, as this would
         * conflict with policies implementing MPO thread affinity.
         *
         * If the PG is of a sharing relationship type known to
         * legitimately span lgroups, specify that no CMT thread
         * placement policy should be implemented, and prune the PG
         * from the existing CMT PG hierarchy.
         *
         * Otherwise, fall through to the case below for handling.
         */
        if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
            if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
                cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                goto revalidate;
            }
        }
        /*LINTED*/
    case CMT_LINEAGE_NON_PROMOTABLE:
        /*
         * We've detected a PG that already exists in another CPU's
         * lineage that cannot legally be promoted into place
         * without breaking the invariants of the hierarchy.
         */
        if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
            if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
                cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                goto revalidate;
            }
        }
        /*
         * Something went wrong trying to prune out the bad level.
         * Disable CMT scheduling altogether.
         */
        pg_cmt_disable();
        break;
    case CMT_LINEAGE_NON_CONCENTRIC:
        /*
         * We've detected a non-concentric PG lineage, which means that
         * there's a PG in the lineage that has CPUs that the next PG
         * over in the lineage (which is the same size or larger)
         * doesn't have.
         *
         * In this case, we examine the two PGs to see if either
         * grouping is defined by potentially buggy sources.
         *
         * If one has fewer CPUs than the other, and contains CPUs
         * not found in the parent, and it is an untrusted enumeration,
         * then prune it. If both have the same number of CPUs, then
         * prune the one that is untrusted.
         *
         * This process repeats until we have a concentric lineage,
         * or we would have to prune out level derived from what we
         * thought was a reliable source, in which case CMT scheduling
         * is disabled altogether.
         *
         * pg_next is non-NULL here, since this status is only set
         * above when pg_next != NULL.
         */
        if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
            (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
            pg_bad = pg;
        } else if (PG_NUM_CPUS((pg_t *)pg) ==
            PG_NUM_CPUS((pg_t *)pg_next)) {
            /* Equal sizes: the suspect check on pg_next comes first */
            if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
                pg_bad = pg_next;
            } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
                pg_bad = pg;
            }
        }
        if (pg_bad) {
            if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
                cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                goto revalidate;
            }
        }
        /*
         * Something went wrong trying to identify and/or prune out
         * the bad level. Disable CMT scheduling altogether.
         */
        pg_cmt_disable();
        break;
    default:
        /*
         * If we're here, we've encountered a validation error for
         * which we don't know how to recover. In this case, disable
         * CMT scheduling altogether.
         */
        cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
        pg_cmt_disable();
    }
    return (cmt_lineage_status);
}