/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <sys/processor.h>
/*
* CMT scheduler / dispatcher support
*
* This file implements CMT scheduler support using Processor Groups.
* The CMT processor group class creates and maintains the CMT class
* specific processor group pg_cmt_t.
*
* ---------------------------- <-- pg_cmt_t *
* | pghw_t |
* ----------------------------
* | CMT class specific data |
* | - hierarchy linkage |
* | - CMT load balancing data|
* ----------------------------
*
* The scheduler/dispatcher leverages knowledge of the performance
* relevant CMT sharing relationships existing between cpus to implement
* optimized affinity, load balancing, and coalescence policies.
*
* Load balancing policy seeks to improve performance by minimizing
* contention over shared processor resources / facilities. Affinity
* policies seek to improve cache and TLB utilization. Coalescence
* policies improve resource utilization and ultimately power efficiency.
*
* The CMT PGs created by this class are already arranged into a
* hierarchy (which is done in the pghw layer). To implement the top-down
* CMT load balancing algorithm, the CMT PGs additionally maintain
* parent, child and sibling hierarchy relationships.
* Parent PGs always contain a superset of their children's resources,
* each PG can have at most one parent, and siblings are the group of PGs
* sharing the same parent.
*
* On UMA based systems, the CMT load balancing algorithm begins by balancing
* load across the group of top level PGs in the system hierarchy.
* On NUMA systems, the CMT load balancing algorithm balances load across the
* group of top level PGs in each leaf lgroup...but for root homed threads,
* is willing to balance against all the top level PGs in the system.
*
* Groups of top level PGs are maintained to implement the above, one for each
* leaf lgroup (containing the top level PGs in that lgroup), and one (for the
* root lgroup) that contains all the top level PGs in the system.
*/
/* used for null_proc_lpa */
/*
* Array of hardware sharing relationships that are blacklisted.
* CMT scheduling optimizations won't be performed for blacklisted sharing
* relationships.
*/
/*
* Set this to non-zero to disable CMT scheduling.
*
* The CMT class routines in this file test this flag on entry and return
* early when it is set; the power aware dispatching interfaces return -1
* when it equals 1. pg_cmt_disable() sets it to 1.
*/
int cmt_sched_disabled = 0;
/*
* Status codes for CMT lineage validation
* See pg_cmt_lineage_validate() below
*/
typedef enum cmt_lineage_validation {
/*
* Status of the current lineage under construction.
* One must be holding cpu_lock to change this.
*/
/*
* Power domain definitions (on x86) are defined by ACPI, and
* therefore may be subject to BIOS bugs.
*/
/*
* Macro to test if PG is managed by the CMT PG class
*/
/*
* Forward declarations for file-local CMT PG routines.
*
* pg_cmt_alloc takes no arguments; it is declared with an explicit (void)
* so the declaration is a true prototype and matches its definition.
*/
static pg_t *pg_cmt_alloc(void);
static void pg_cmt_free(pg_t *);
static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static char *pg_cmt_policy_name(pg_t *);
static void pg_cmt_hier_sort(pg_cmt_t **, int);
static int pg_cmt_hw(pghw_type_t);
cpu_pg_t *);
/*
* CMT PG ops
*/
NULL, /* cpupart_out */
};
/*
* Initialize the CMT PG class
*/
void
pg_cmt_class_init(void)
{
	/* Bail out early when CMT scheduling has been disabled. */
	if (cmt_sched_disabled != 0) {
		return;
	}
}
/*
* Called to indicate a new CPU has started up so
* that either t0 or the slave startup thread can
* be accounted for.
*/
void
{
cp->cpu_thread);
}
/*
* Return non-zero if thread can migrate between "from" and "to"
* without a performance penalty
*/
int
{
return (1);
return (0);
}
/*
* CMT class specific PG allocation
*/
static pg_t *
pg_cmt_alloc(void)
{
/*
* NOTE(review): no statements are visible in this body, yet the
* function is declared to return pg_t *. Confirm the allocation and
* return statement against the complete source.
*/
}
/*
* Class specific PG de-allocation
*/
static void
{
}
/*
* Given a hardware sharing relationship, return which dispatcher
* policies should be implemented to optimize performance and efficiency
*/
static pg_cmt_policy_t
{
/*
* Give the platform a chance to override the default
*/
return (p);
switch (hw) {
case PGHW_IPIPE:
case PGHW_FPU:
case PGHW_PROCNODE:
case PGHW_CHIP:
return (CMT_BALANCE);
case PGHW_CACHE:
return (CMT_AFFINITY | CMT_BALANCE);
case PGHW_POW_ACTIVE:
case PGHW_POW_IDLE:
return (CMT_BALANCE);
default:
return (CMT_NO_POLICY);
}
}
/*
* Rank the importance of optimizing for the pg1 relationship vs.
* the pg2 relationship.
*/
static pg_cmt_t *
{
/*
* A power domain is only important if CPUPM is enabled.
*/
if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
return (pg2);
return (pg1);
}
/*
* Otherwise, ask the platform
*/
return (pg1);
else
return (pg2);
}
/*
* Initialize CMT callbacks for the given PG
*/
static void
{
/*
* Stick with the default callbacks if there isn't going to be
* any CMT thread placement optimizations implemented.
*/
return;
case PGHW_POW_ACTIVE:
break;
default:
}
}
/*
* Promote PG above its current parent.
* This is only legal if PG has an equal or greater number of CPUs than its
* parent.
*
* This routine operates on the CPU specific processor group data (for the CPUs
* in the PG being promoted), and may be invoked from a context where one CPU's
* PG data is under construction. In this case the argument "pgdata", if not
* NULL, is a reference to the CPU's under-construction PG data.
*/
static void
{
int r;
int err;
int nchildren;
/*
* Nothing to do
*/
return;
}
/*
* We're changing around the hierarchy, which is actively traversed
* by the dispatcher. Pause CPUS to ensure exclusivity.
*/
/*
* If necessary, update the parent's sibling set, replacing parent
* with PG.
*/
if (parent->cmt_siblings) {
!= -1) {
ASSERT(r != -1);
}
}
/*
* If the parent is at the top of the hierarchy, replace its entry
* in the root lgroup's group of top level PGs.
*/
!= -1) {
ASSERT(r != -1);
}
}
/*
* We assume (and therefore assert) that the PG being promoted is an
* only child of its parent. Update the parent's children set
* replacing PG's entry with the parent (since the parent is becoming
* the child). Then have PG and the parent swap children sets and
* children counts.
*/
ASSERT(r != -1);
}
/*
* Update the sibling references for PG and its parent
*/
/*
* Update any cached lineages in the per CPU pg data.
*/
int idx;
int sz;
/*
* The CPU whose lineage is under construction still
* references the bootstrap CPU PG data structure.
*/
if (pg_cpu_is_bootstrapped(cpu))
else
/*
* Iterate over the CPU's PGs updating the children
* of the PG being promoted, since they have a new parent.
*/
}
}
/*
* Update the CMT load balancing lineage
*/
/*
* Unless this is the CPU whose lineage is being
* constructed, the PG being promoted should be
* in the lineage.
*/
continue;
}
/*
* Have the child and the parent swap places in the CPU's
* lineage
*/
/*
* Ensure cmt_lineage references CPU's leaf PG.
* Since cmt_pgs is top-down ordered, the bottom is the last
* element.
*/
}
/*
* Update the parent references for PG and its parent
*/
start_cpus();
}
/*
* CMT class callback for a new CPU entering the system
*
* This routine operates on the CPU specific processor group data (for the CPU
* being initialized). The argument "pgdata" is a reference to the CPU's PG
* data to be constructed.
*
* cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
* references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
* calls must be careful to operate only on the "pgdata" argument, and not
* cp->cpu_pg.
*/
static void
{
if (cmt_sched_disabled)
return;
/*
* A new CPU is coming into the system.
* Interrogate the platform to see if the CPU
* has any performance or efficiency relevant
* sharing relationships
*/
levels = 0;
/*
* We're only interested in the hw sharing relationships
* for which we know how to optimize.
*/
if (policy == CMT_NO_POLICY ||
continue;
/*
* We will still create the PGs for hardware sharing
* relationships that have been blacklisted, but won't
* implement CMT thread placement optimizations against them.
*/
/*
* Find (or create) the PG associated with
* the hw sharing relationship in which cp
* belongs.
*
* Determine if a suitable PG already
* exists, or if one needs to be created.
*/
/*
* Create a new one.
* Initialize the common...
*/
/* ... physical ... */
/*
* ... and CMT specific portions of the
* structure.
*/
/* CMT event callbacks */
} else {
}
/* Add the CPU to the PG */
/*
*/
}
/*
* Build a lineage of CMT PGs for load balancing / coalescence
*/
}
/* Cache this for later */
if (hw == PGHW_CACHE)
}
/*
* Find the lgrp that encapsulates this CPU's CMT hierarchy
*/
/*
* Ascendingly sort the PGs in the lineage by number of CPUs
*/
/*
* Examine the lineage and validate it.
* This routine will also try to fix the lineage along with the
* rest of the PG hierarchy should it detect an issue.
*
* If it returns anything other than VALID or REPAIRED, an
* unrecoverable error has occurred, and we cannot proceed.
*/
if ((lineage_status != CMT_LINEAGE_VALID) &&
(lineage_status != CMT_LINEAGE_REPAIRED)) {
/*
* In the case of an unrecoverable error where CMT scheduling
* has been disabled, assert that the under construction CPU's
* PG data has an empty CMT load balancing lineage.
*/
ASSERT((cmt_sched_disabled == 0) ||
return;
}
/*
* For existing PGs in the lineage, verify that the parent is
* correct, as the generation in the lineage may have changed
* as a result of the sorting. Start the traversal at the top
* of the lineage, moving down.
*/
int reorg;
reorg = 0;
/*
* Promote PGs at an incorrect generation into place.
*/
while (pg->cmt_parent &&
reorg++;
}
if (reorg > 0)
else
level--;
}
/*
* For each of the PGs in the CPU's lineage:
* - Add an entry in the CPU sorted CMT PG group
* which is used for top down CMT load balancing
* - Tie the PG into the CMT hierarchy by connecting
* it to its parent and siblings.
*/
int err;
if (level == 0)
/* Already initialized */
continue;
}
} else {
/*
* A good parent keeps track of their children.
* The parent's children group is also the PG's
* siblings.
*/
}
}
}
/*
* Cache the chip and core IDs in the cpu_t->cpu_physid structure
* for fast lookups later.
*/
if (cp->cpu_physid) {
/*
* If this cpu has a PG representing shared cache, then set
* cpu_cacheid to that PG's logical id
*/
if (pg_cache)
}
/* CPU0 only initialization */
if (is_cpu0) {
is_cpu0 = 0;
}
}
/*
* Class callback when a CPU is leaving the system (deletion)
*
* "pgdata" is a reference to the CPU's PG data to be deconstructed.
*
* cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
* references a "bootstrap" structure across this function's invocation.
* pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
* on the "pgdata" argument, and not cp->cpu_pg.
*/
static void
{
group_iter_t i;
if (cmt_sched_disabled)
return;
/*
* Find the lgroup that encapsulates this CPU's CMT hierarchy
*/
/*
* One might wonder how we could be deconfiguring the
* only CPU in the system.
*
* On Starcat systems when null_proc_lpa is detected,
* the boot CPU (which is already configured into a leaf
* lgroup), is moved into the root lgroup. This is done by
* deconfiguring it from both lgroups and processor
* groups), and then later reconfiguring it back in. This
* call to pg_cmt_cpu_fini() is part of that deconfiguration.
*
* This special case is detected by noting that the platform
* has changed the CPU's lgrp affiliation (since it now
* belongs in the root). In this case, use the cmt_lgrp_t
* cached for the boot CPU, since this is what needs to be
* torn down.
*/
}
/*
* First, clean up anything load balancing specific for each of
* the CPU's PGs that participated in CMT load balancing
*/
/*
* Remove the PG from the CPU's load balancing lineage
*/
/*
* If it's about to become empty, destroy its children
* group, and remove its reference from its siblings.
* This is done here (rather than below) to avoid removing
* our reference from a PG that we just eliminated.
*/
else
}
}
}
/*
* Now that the load balancing lineage updates have happened,
* remove the CPU from all its PGs (destroying any that become
* empty).
*/
group_iter_init(&i);
continue;
/*
* Deleting the CPU from the PG changes the CPU's
* PG group over which we are actively iterating.
* Re-initialize the iteration.
*/
group_iter_init(&i);
/*
* The PG has become zero sized, so destroy it.
*/
}
}
}
/*
* Class callback when a CPU is entering a cpu partition
*/
static void
{
group_iter_t i;
if (cmt_sched_disabled)
return;
/*
* Ensure that the new partition's PG bitset
* is large enough for all CMT PG's to which cp
* belongs
*/
group_iter_init(&i);
continue;
}
}
/*
* Class callback when a CPU is actually moving partitions
*/
static void
{
if (cmt_sched_disabled)
return;
/*
* Iterate over the CPU's CMT PGs
*/
continue;
/*
* Add the PG to the bitset in the new partition.
*/
/*
* Remove the PG from the bitset in the old partition
* if the last of the PG's CPUs have left.
*/
continue;
if (CPU_ACTIVE(cpp) &&
break;
}
}
if (!found)
}
}
/*
* Class callback when a CPU becomes active (online)
*
* This is called in a context where CPUs are paused
*/
static void
{
int err;
group_iter_t i;
if (cmt_sched_disabled)
return;
group_iter_init(&i);
/*
* Iterate over the CPU's PGs
*/
continue;
/*
* Move to the next generation since topology is changing
*/
/*
* If this is the first active CPU in the PG, and it
* represents a hardware sharing relationship over which
* CMT load balancing is performed, add it as a candidate
* for balancing with its siblings.
*/
/*
* If this is a top level PG, add it as a balancing
* candidate when balancing within the root lgroup.
*/
}
}
/*
* Notate the CPU in the PG's active CPU bitset.
* Also notate the PG as being active in its associated
* partition
*/
}
}
/*
* Class callback when a CPU goes inactive (offline)
*
* This is called in a context where CPUs are paused
*/
static void
{
int err;
group_iter_t i;
if (cmt_sched_disabled)
return;
group_iter_init(&i);
continue;
/*
* Move to the next generation since topology is changing
*/
/*
* Remove the CPU from the CMT PGs active CPU group
* bitmap
*/
/*
* If there are no more active CPUs in this PG over which
* load was balanced, remove it as a balancing candidate.
*/
}
}
/*
* Assert the number of active CPUs does not exceed
* the total number of CPUs in the PG
*/
/*
* Update the PG bitset in the CPU's old partition
*/
continue;
if (CPU_ACTIVE(cpp) &&
break;
}
}
if (!found) {
}
}
}
/*
* Return non-zero if the CPU belongs in the given PG
*/
static int
{
/*
* The CPU belongs if, given the nature of the hardware sharing
* relationship represented by the PG, the CPU has that
* relationship with some other CPU already in the PG
*/
return (1);
return (0);
}
/*
* Sort the CPU's CMT hierarchy, where "size" is the number of levels.
*/
static void
{
/*
* First sort by number of CPUs
*/
while (inc > 0) {
j = i;
tmp = h[i];
while ((j >= inc) &&
h[j] = h[j - inc];
j = j - inc;
}
h[j] = tmp;
}
if (inc == 2)
inc = 1;
else
}
/*
* Break ties by asking the platform.
* Determine if h[i] outranks h[i + 1] and if so, swap them.
*/
/*
* Find various contiguous sets of elements,
* in the array, with the same number of cpus
*/
end++;
/*
* Sort each such set of the array by rank
*/
j = i - 1;
tmp = h[i];
while (j >= start &&
pg_cmt_hier_rank(hier[j],
h[j + 1] = h[j];
j--;
}
h[j + 1] = tmp;
}
}
}
/*
* Return a cmt_lgrp_t * given an lgroup handle.
*/
static cmt_lgrp_t *
{
break;
}
return (lgrp);
}
/*
* Create a cmt_lgrp_t with the specified handle.
*/
static cmt_lgrp_t *
{
return (lgrp);
}
/*
* Interfaces to enable and disable power aware dispatching
* The caller must be holding cpu_lock.
*
* Return 0 on success and -1 on failure.
*/
int
{
if (cmt_sched_disabled == 1)
return (-1);
/*
* Unable to find any instances of the specified type
* of power domain, or the power domains have been blacklisted.
*/
return (-1);
}
/*
* Iterate over the power domains, setting the default dispatcher
* policy for power/performance optimization.
*
* Simply setting the policy isn't enough in the case where the power
* domain is an only child of another PG. Because the dispatcher walks
* the PG hierarchy in a top down fashion, the higher up PG's policy
* will dominate. So promote the power domain above its parent if both
* PG and its parent have the same CPUs to ensure its policy
* dominates.
*/
/*
* If the power domain is an only child to a parent
* not implementing the same policy, promote the child
* above the parent to activate the policy.
*/
}
}
return (0);
}
int
{
if (cmt_sched_disabled == 1)
return (-1);
/*
* Unable to find any instances of the specified type of
* power domain.
*/
return (-1);
}
/*
* Iterate over the power domains, setting the default dispatcher
* policy for performance optimization (load balancing).
*/
/*
* If the power domain has an only child that implements
* policy other than load balancing, promote the child
* above the power domain to ensure its policy dominates.
*/
}
}
}
return (0);
}
/* ARGSUSED */
static void
{
}
}
/*
* Macro to test whether a thread is currently runnable on a CPU in a PG.
*/
(t)->t_disp_queue->disp_cpu && \
static void
{
uint32_t u;
if (u == 1) {
/*
* Notify the CPU power manager that the domain
* is non-idle.
*/
}
if (u == 0) {
/*
* The domain is idle, notify the CPU power
* manager.
*
* Avoid notifying if the thread is simply migrating
* between CPUs in the domain.
*/
}
}
}
}
/* ARGSUSED */
static void
{
}
/*
* Return the name of the CMT scheduling policy
* being implemented across this PG
*/
static char *
{
if (policy & CMT_AFFINITY) {
if (policy & CMT_BALANCE)
return ("Load Balancing & Affinity");
else if (policy & CMT_COALESCE)
return ("Load Coalescence & Affinity");
else
return ("Affinity");
} else {
if (policy & CMT_BALANCE)
return ("Load Balancing");
else if (policy & CMT_COALESCE)
return ("Load Coalescence");
else
return ("None");
}
}
/*
* Prune PG, and all other instances of PG's hardware sharing relationship
* from the CMT PG hierarchy.
*
* This routine operates on the CPU specific processor group data (for the CPUs
* in the PG being pruned), and may be invoked from a context where one CPU's
* PG data is under construction. In this case the argument "pgdata", if not
* NULL, is a reference to the CPU's under-construction PG data.
*/
static int
{
int cap_needed;
/*
* Inform pghw layer that this PG is pruned.
*/
if (hw == PGHW_POW_ACTIVE) {
"Event Based CPUPM Unavailable");
} else if (hw == PGHW_POW_IDLE) {
"Dispatcher assisted CPUPM disabled.");
}
/*
* Find and eliminate the PG from the lineage.
*/
for (i = 0; i < size; i++) {
for (j = i; j < size - 1; j++)
break;
}
}
/*
* We'll prune all instances of the hardware sharing relationship
* represented by pg. But before we do that (and pause CPUs) we need
* to ensure the hierarchy's groups are properly sized.
*/
/*
* Blacklist the hardware so future processor groups of this type won't
* participate in CMT thread placement.
*
* XXX
* For heterogeneous system configurations, this might be overkill.
* We may only need to blacklist the illegal PGs, and other instances
* of this hardware sharing relationship may be ok.
*/
/*
* For each of the PGs being pruned, ensure sufficient capacity in
* the siblings set for the PG's children
*/
/*
* PG is being pruned, but if it is bringing up more than
* one child, ask for more capacity in the siblings group.
*/
cap_needed = 0;
if (pg->cmt_children &&
/*
* If this is a top level group, also ensure the
* capacity in the root lgrp level CMT grouping.
*/
}
}
}
/*
* We're operating on the PG hierarchy. Pause CPUs to ensure
* exclusivity with respect to the dispatcher.
*/
/*
* Prune all PG instances of the hardware sharing relationship
* represented by pg.
*/
/*
* Remove PG from its group of siblings, if it's there.
*/
if (pg->cmt_siblings) {
}
}
/*
* Indicate that no CMT policy will be implemented across
* this PG.
*/
/*
* Move PG's children from its children set to its parent's
* children set. Note that the parent's children set, and PG's
* siblings set are the same thing.
*
* Because we are iterating over the same group that we are
* operating on (removing the children), first add all of PG's
* children to the parent's children set, and once we are done
* iterating, empty PG's children set.
*/
!= NULL) {
ASSERT(r == 0);
pg->cmt_siblings !=
ASSERT(r == 0);
}
}
}
}
/*
* Reset the callbacks to the defaults
*/
/*
* Update all the CPU lineages in each of PG's CPUs
*/
/*
* The CPU whose lineage is under construction still
* references the bootstrap CPU PG data structure.
*/
if (pg_cpu_is_bootstrapped(cpu))
else
/*
* Iterate over the CPU's PGs updating the children
* of the PG being pruned, since they have a new
* parent and siblings set.
*/
}
}
/*
* Update the CPU's lineages
*
* Remove the PG from the CPU's group used for CMT
* scheduling.
*/
}
}
start_cpus();
return (0);
}
/*
* Disable CMT scheduling
*/
static void
pg_cmt_disable(void)
{
do {
cmt_sched_disabled = 1;
start_cpus();
}
/*
* CMT lineage validation
*
* This routine is invoked by pg_cmt_cpu_init() to validate the integrity
* of the PGs in a CPU's lineage. This is necessary because it's possible that
* some groupings (power domain groupings in particular) may be defined by
* sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
* possible to integrate those groupings into the CMT PG hierarchy, if doing
* so would violate the subset invariant of the hierarchy, which says that
* a PG must be subset of its parent (if it has one).
*
* pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
* would result in a violation of this invariant. If a violation is found,
* and the PG is of a grouping type whose definition is known to originate from
* suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
* PG (and all other instances of PG's sharing relationship type) from the CMT
* hierarchy. Further, future instances of that sharing relationship type won't
* be added. If the grouping definition doesn't originate from suspect
* sources, then pg_cmt_disable() will be invoked to log an error, and disable
* CMT scheduling altogether.
*
* This routine is invoked after the CPU has been added to the PGs in which
* it belongs, but before those PGs have been added to (or had their place
* adjusted in) the CMT PG hierarchy.
*
* The first argument is the CPU's PG lineage (essentially an array of PGs in
* which the CPU belongs) that has already been sorted in ascending order
* by CPU count. Some of the PGs in the CPU's lineage may already have other
* CPUs in them, and have already been integrated into the CMT hierarchy.
*
* The addition of this new CPU to these pre-existing PGs means that those
* PGs may need to be promoted up in the hierarchy to satisfy the subset
* invariant. In addition to testing the subset invariant for the lineage,
* this routine also verifies that the addition of the new CPU to the
* existing PGs wouldn't cause the subset invariant to be violated in
* the existing lineages.
*
* This routine will normally return one of the following:
* CMT_LINEAGE_VALID - There were no problems detected with the lineage.
* CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
*
* Otherwise, this routine will return a value indicating which error it
* was unable to recover from (and set cmt_lineage_status along the way).
*
* This routine operates on the CPU specific processor group data (for the CPU
* whose lineage is being validated), which is under-construction.
* "pgdata" is a reference to the CPU's under-construction PG data.
* This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
*/
static cmt_lineage_validation_t
{
int i, j, size;
for (i = 0; i < size; i++) {
if (i < size - 1)
else
/*
* We assume that the lineage has already been sorted
* by the number of CPUs. In fact, we depend on it.
*/
/*
* The CPU's PG lineage was passed as the first argument to
* this routine and contains the sorted list of the CPU's
* PGs. Ultimately, the ordering of the PGs in that list, and
* the ordering as traversed by the cmt_parent list must be
* the same. PG promotion will be used as the mechanism to
* achieve this, but first we need to look for cases where
* promotion will be necessary, and validate that will be
* possible without violating the subset invariant described
* above.
*
* Since the PG topology is in the middle of being changed, we
* need to check whether the PG's existing parent (if any) is
* part of this CPU's lineage (and therefore should contain
* the new CPU). If not, it means that the addition of the
* new CPU should have made this PG have more CPUs than its
* parent (and other ancestors not in the same lineage) and
* will need to be promoted into place.
*
* We need to verify all of this to defend against a buggy
* BIOS giving bad power domain CPU groupings. Sigh.
*/
/*
*/
}
/*
* It's in the lineage. The concentricity
* checks will handle the rest.
*/
break;
}
/*
* If it is not in the lineage, PG will eventually
* need to be promoted above it. Verify the ancestor
* is a proper subset. There is still an error if
* the ancestor has the same number of CPUs as PG,
* since that would imply it should be in the lineage,
* and we already know it isn't.
*/
/*
* has the same or more CPUs than PG.
*/
goto handle_error;
}
}
/*
* Walk each of the CPUs in the PGs group and perform
* consistency checks along the way.
*/
/*
* Verify that there aren't any CPUs contained in PG
* that the next PG in the lineage (which is larger
* or same size) doesn't also contain.
*/
goto handle_error;
}
/*
* Verify that all the CPUs in the PG are in the same
* lgroup.
*/
if (lgrp == LGRP_NULL_HANDLE) {
goto handle_error;
}
}
}
/*
* Some of these validation errors can result when the CPU grouping
* information is derived from buggy sources (for example, incorrect
* ACPI tables on x86 systems).
*
* We'll try to recover in such cases by pruning out the illegal
* groupings from the PG hierarchy, which means that we won't optimize
* for those levels, but we will for the remaining ones.
*/
switch (cmt_lineage_status) {
case CMT_LINEAGE_VALID:
case CMT_LINEAGE_REPAIRED:
break;
/*
* We've detected a PG whose CPUs span lgroups.
*
* This isn't supported, as the dispatcher isn't allowed to
* do CMT thread placement across lgroups, as this would
* conflict with policies implementing MPO thread affinity.
*
* If the PG is of a sharing relationship type known to
* legitimately span lgroups, specify that no CMT thread
* placement policy should be implemented, and prune the PG
* from the existing CMT PG hierarchy.
*
* Otherwise, fall through to the case below for handling.
*/
goto revalidate;
}
}
/*LINTED*/
/*
* We've detected a PG that already exists in another CPU's
* lineage that cannot legally be promoted into place
* without breaking the invariants of the hierarchy.
*/
goto revalidate;
}
}
/*
* Something went wrong trying to prune out the bad level.
* Disable CMT scheduling altogether.
*/
break;
/*
* We've detected a non-concentric PG lineage, which means that
* there's a PG in the lineage that has CPUs that the next PG
* over in the lineage (which is the same size or larger)
* doesn't have.
*
* In this case, we examine the two PGs to see if either
* grouping is defined by potentially buggy sources.
*
* If one has fewer CPUs than the other, and contains CPUs
* not found in the parent, and it is an untrusted enumeration,
* then prune it. If both have the same number of CPUs, then
* prune the one that is untrusted.
*
* This process repeats until we have a concentric lineage,
* or we would have to prune out a level derived from what we
* thought was a reliable source, in which case CMT scheduling
* is disabled altogether.
*/
}
}
if (pg_bad) {
goto revalidate;
}
}
/*
* the bad level. Disable CMT scheduling altogether.
*/
break;
default:
/*
* If we're here, we've encountered a validation error for
* which we don't know how to recover. In this case, disable
* CMT scheduling altogether.
*/
}
return (cmt_lineage_status);
}