lgrp.c revision 611ffe8a3112495ac3288bbe1f81f9f09a61dc9e
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Basic NUMA support in terms of locality groups
*
* Solaris needs to know which CPUs, memory, etc. are near each other to
* provide good performance on NUMA machines by optimizing for locality.
* In order to do this, a new abstraction called a "locality group (lgroup)"
* has been introduced to keep track of which CPU-like and memory-like hardware
* resources are close to each other. Currently, latency is the only measure
* used to determine how to group hardware resources into lgroups, but this
* does not limit the groupings to be based solely on latency. Other factors
* may be used to determine the groupings in the future.
*
* Lgroups are organized into a hierarchy or topology that represents the
* latency topology of the machine. There is always at least a root lgroup in
* the system. It represents all the hardware resources in the machine at a
* latency big enough that any hardware resource can at least access any other
* hardware resource within that latency. A Uniform Memory Access (UMA)
* machine is represented with one lgroup (the root). In contrast, a NUMA
* machine is represented at least by the root lgroup and some number of leaf
* lgroups where the leaf lgroups contain the hardware resources within the
* least latency of each other and the root lgroup still contains all the
* resources in the machine. Some number of intermediate lgroups may exist
* which represent more levels of locality than just the local latency of the
* leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
* (e.g. root and intermediate lgroups) contain the next nearest resources to
* their child lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
* to the root lgroup shows the hardware resources from closest to farthest
* from the leaf lgroup such that each successive ancestor lgroup contains
* the next nearest resources at the next level of locality from the previous.
*
* The kernel uses the lgroup abstraction to know how to allocate resources
* near a given process/thread. At fork() and lwp/thread_create() time, a
* "home" lgroup is chosen for a thread. This is done by picking the lgroup
* with the lowest load average. Binding to a processor or processor set will
* change the home lgroup for a thread. The scheduler has been modified to try
* to dispatch a thread on a CPU in its home lgroup. Physical memory
* allocation is lgroup aware too, so memory will be allocated from the current
* thread's home lgroup if possible. If the desired resources are not
* available, the kernel traverses the lgroup hierarchy going to the parent
* lgroup to find resources at the next level of locality until it reaches the
* root lgroup.
*/
#include <sys/lgrp_user.h>
#include <vm/seg_kmem.h>
#include <sys/sysmacros.h>
/* indexed by lgrp_id */
int nlgrps; /* number of lgroups in machine */
int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */
/*
* Kstat data for lgroups.
*
* Actual kstat data is collected in lgrp_stats array.
* The lgrp_kstat_data array of named kstats is used to extract data from
* lgrp_stats and present it to kstat framework. It is protected from parallel
* modifications by lgrp_kstat_mutex. This may cause some contention when
* several kstat commands run in parallel but this is not the
* performance-critical path.
*/
/*
* Declare kstat names statically for enums as defined in the header file.
*/
static void lgrp_kstat_init(void);
static int lgrp_kstat_extract(kstat_t *, int);
static void lgrp_kstat_reset(lgrp_id_t);
static kmutex_t lgrp_kstat_mutex;
/*
* max number of lgroups supported by the platform
*/
int nlgrpsmax = 0;
/*
* The root lgroup. Represents the set of resources at the system wide
* level of locality.
*/
/*
* During system bootstrap cp_default does not contain the list of lgrp load
* averages (cp_lgrploads). The list is allocated after the first CPU is brought
* on-line when cp_default is initialized by cpupart_initialize_default().
* Configuring CPU0 may create a two-level topology with root and one leaf node
* containing CPU0. This topology is initially constructed in a special
* statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
* to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
* for all lpl operations until cp_default is fully constructed.
*
* The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
* consumer who needs default lpl should use lpl_bootstrap which is a pointer to
* the first element of lpl_bootstrap_list.
*
* CPUs that are added to the system, but have not yet been assigned to an
* lgrp will use lpl_bootstrap as a default lpl. This is necessary because
* on some architectures (x86) it's possible for the slave CPU startup thread
* to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
*/
#define LPL_BOOTSTRAP_SIZE 2
/*
* If cp still references the bootstrap lpl, it has not yet been added to
* an lgrp. lgrp_mem_choose() uses this macro to detect the case where
* a thread is trying to allocate memory close to a CPU that has no lgrp.
*/
/*
* Size, in bytes, beyond which random memory allocation policy is applied
* to non-shared memory. Default is the maximum size, so random memory
* allocation won't be used for non-shared memory by default.
*/
/*
* Size, in bytes, beyond which random memory allocation policy is applied to
* shared memory. Default is 8MB (2 ISM pages).
*/
/*
* Whether to do processor set aware memory allocation by default
*/
int lgrp_mem_pset_aware = 0;
/*
* Set the default memory allocation policy for root lgroup
*/
/*
* Set the default memory allocation policy. For most platforms,
* next touch is sufficient, but some platforms may wish to override
* this.
*/
/*
* lgroup CPU event handlers
*/
static void lgrp_cpu_init(struct cpu *);
/*
* lgroup memory event handlers
*/
/*
* lgroup CPU partition event handlers
*/
static void lgrp_part_del_cpu(struct cpu *);
static void lgrp_root_init(void);
/*
* lpl topology
*/
static void lpl_verify_wrapper(struct cpupart *);
/*
* defines for lpl topology verifier return codes
*/
#define LPL_TOPO_CORRECT 0
#define LPL_TOPO_PART_HAS_NO_LPL -1
#define LPL_TOPO_CPUS_NOT_EMPTY -2
#define LPL_TOPO_LGRP_MISMATCH -3
#define LPL_TOPO_MISSING_PARENT -4
#define LPL_TOPO_PARENT_MISMATCH -5
#define LPL_TOPO_BAD_CPUCNT -6
#define LPL_TOPO_RSET_MISMATCH -7
#define LPL_TOPO_LPL_ORPHANED -8
#define LPL_TOPO_LPL_BAD_NCPU -9
#define LPL_TOPO_RSET_MSSNG_LF -10
#define LPL_TOPO_CPU_HAS_BAD_LPL -11
#define LPL_TOPO_BOGUS_HINT -12
#define LPL_TOPO_NONLEAF_HAS_CPUS -13
#define LPL_TOPO_LGRP_NOT_LEAF -14
#define LPL_TOPO_BAD_RSETCNT -15
/*
 * Tell the caller whether lgroup optimizations should be turned on.
 *
 * Returns 1 when the machine has more than two lgroups and 0 otherwise.
 */
int
lgrp_optimizations(void)
{
	/*
	 * A machine with two lgroups or fewer does not get lgroup
	 * optimizations.
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources. A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	return ((nlgrps > 2) ? 1 : 0);
}
/*
* Set up the root lgroup, the top of the lgroup topology. The visible
* statements zero the root's lgrp_mnodes set and its memnode, CPU, child
* and chip counts, and fetch the root's platform handle from
* lgrp_plat_root_hand().
*
* NOTE(review): the declaration and use of "hand", the body of the
* LGRP_RSRC_COUNT loop and the bootstrap lpl setup described at the end
* of the body are not visible here -- confirm against the complete source.
*/
static void
lgrp_root_init(void)
{
int i;
/*
* Create the "root" lgroup
*/
lgrp_root->lgrp_mnodes = 0;
lgrp_root->lgrp_nmnodes = 0;
hand = lgrp_plat_root_hand();
lgrp_root->lgrp_cpucnt = 0;
lgrp_root->lgrp_childcnt = 0;
lgrp_root->lgrp_chipcnt = 0;
for (i = 0; i < LGRP_RSRC_COUNT; i++)
/*
* Setup initial lpl list for CPU0 and initial t0 home.
* The only lpl space we have so far is lpl_bootstrap. It is used for
* all topology operations until cp_default is initialized at which
* point t0.t_lpl will be updated.
*/
}
/*
* Initialize the lgroup framework and allow the platform to do the same.
*
* NOTE(review): only the step descriptions below are visible in this
* body; the statements that carry them out are not shown here.
*/
void
lgrp_init(void)
{
/*
* Step 1: initialize the platform
*/
/*
* Step 2: set the max number of lgroups supported on this platform,
* which must be less than the max number of lgroups supported by the
* common lgroup framework (e.g. NLGRPS_MAX is max elements in
* lgrp_table[], etc.)
*/
}
/*
* Create the root and cpu0's lgroup, and set t0's home.
*
* NOTE(review): only the step descriptions below are visible in this
* body; the statements that carry them out are not shown here.
*/
void
lgrp_setup(void)
{
/*
* Step 1: set up the root lgroup
*/
/*
* Step 2: add cpu0 to an lgroup
*/
}
/*
* Lgroup initialization is split in two parts. The first part
* (lgrp_main_init()) is called right before start_other_cpus() in main. The
* second part (lgrp_main_mp_init()) is called right after start_other_cpus()
* when all CPUs are brought online and all distance information is available.
*
* When lgrp_main_init() is complete it sets lgrp_initialized. The
* lgrp_main_mp_init() sets lgrp_topo_initialized.
*/
/*
* true when lgrp initialization has been completed.
*/
int lgrp_initialized = 0;
/*
* True when lgrp topology is constructed.
*/
int lgrp_topo_initialized = 0;
/*
* First half of lgroup initialization, run after cpu0 has been added to
* an lgroup. Sets lgrp_initialized once it is done.
*
* NOTE(review): parts of this body are not visible here (the remainder of
* the lgrp_mem_default_policy check, the test that decides whether MPO is
* disabled, and the kstat calls) -- confirm against the complete source.
*/
void
lgrp_main_init(void)
{
int i;
/*
* Enforce a valid lgrp_mem_default_policy
*/
if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
/*
* See if mpo should be disabled.
* This may happen in the case of null proc LPA on Starcat.
* The platform won't be able to detect null proc LPA until after
* cpu0 and memory have already been added to lgroups.
* When and if it is detected, the Starcat platform will return
* a different platform handle for cpu0 which is what we check for
* here. If mpo should be disabled move cpu0 to its rightful place
* (the root), and destroy the remaining lgroups. This effectively
* provides an UMA lgroup topology.
*/
/*
* Destroy all lgroups except for root: walk every slot allocated so
* far and destroy each lgroup that exists and is not lgrp_root.
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
if (LGRP_EXISTS(lgrp_table[i]) &&
lgrp_table[i] != lgrp_root)
lgrp_destroy(lgrp_table[i]);
}
/*
* Fix up root to point at itself for leaves and resources
* and not have any children
*/
lgrp_root->lgrp_childcnt = 0;
}
/*
* Initialize kstats framework.
*/
/*
* cpu0 is finally where it should be, so create its lgroup's kstats
*/
/* Mark this first phase of lgroup initialization as complete. */
lgrp_initialized = 1;
}
/*
* Finish lgrp initialization after all CPUs are brought on-line.
* This routine is called after start_other_cpus().
*
* NOTE(review): only the step description below is visible in this body.
* The overview comment earlier in this file says this routine sets
* lgrp_topo_initialized; that statement is not shown here.
*/
void
lgrp_main_mp_init(void)
{
/*
* Update lgroup topology (if necessary)
*/
}
/*
* Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
*/
void
{
int rc;
switch (event) {
/*
* The following (re)configuration events are common code
* initiated. lgrp_plat_config() is called here to inform the
* platform of the reconfiguration event.
*/
case LGRP_CONFIG_CPU_ADD:
/*
* links, and give it a bootstrap lpl so that it can
* survive should it need to enter the dispatcher.
*/
break;
case LGRP_CONFIG_CPU_DEL:
break;
case LGRP_CONFIG_CPU_ONLINE:
if (rc != LPL_TOPO_CORRECT) {
}
break;
case LGRP_CONFIG_CPU_OFFLINE:
if (rc != LPL_TOPO_CORRECT) {
}
break;
case LGRP_CONFIG_CPUPART_ADD:
if (rc != LPL_TOPO_CORRECT) {
}
break;
case LGRP_CONFIG_CPUPART_DEL:
if (rc != LPL_TOPO_CORRECT) {
}
break;
/*
* The following events are initiated by the memnode
* subsystem.
*/
case LGRP_CONFIG_MEM_ADD:
break;
case LGRP_CONFIG_MEM_DEL:
break;
case LGRP_CONFIG_MEM_RENAME: {
lgrp_mem_rename((int)resource,
break;
}
case LGRP_CONFIG_GEN_UPDATE:
break;
case LGRP_CONFIG_FLATTEN:
if (where == 0)
lgrp_topo_levels = (int)resource;
else
(void) lgrp_topo_flatten(resource,
break;
/*
* Initiated by platform latency probing code
*/
break;
case LGRP_CONFIG_NOP:
break;
default:
break;
}
}
/*
* Called to add lgrp info into cpu structure from cpu_add_unit;
* do not assume cpu is in cpu[] yet!
*
* CPUs are brought online with all other CPUs paused so we can't
* allocate memory or we could deadlock the system, so we rely on
* the platform to statically allocate as much space as we need
* for the lgrp structs and stats.
*/
static void
{
int count;
int first_cpu;
/*
* This is the first time through if the resource set
* for the root lgroup is empty. After cpu0 has been
* initially added to an lgroup, the root's CPU resource
* set can never be empty, since the system's last CPU
* cannot be offlined.
*/
/*
* First time through.
*/
first_cpu = 1;
} else {
/*
* If cpu0 needs to move lgroups, we may come
* through here again, at which time cpu_lock won't
* be held, and lgrp_initialized will be false.
*/
first_cpu = 0;
}
/*
* Create new lgrp and add it to lgroup topology
*/
my_lgrp = lgrp_create();
count = 0;
&changed);
/*
* May have added new intermediate lgroups, so need to add
* resources other than CPUs which are added below
*/
> 0) {
/*
* Leaf lgroup was created, but latency wasn't available
* then. So, set latency for it and fill in rest of lgroup
* topology now that we know how far it is from other leaf
* lgroups.
*/
lgrpid))
&changed);
/*
* May have added new intermediate lgroups, so need to add
* resources other than CPUs which are added below
*/
int i;
/*
* Update existing lgroup and lgroups containing it with CPU
* resource
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
if (!LGRP_EXISTS(lgrp) ||
continue;
}
}
/*
* For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
* end up in lpl for lgroup 0 whether it is supposed to be in there or
* not since none of lgroup IDs in the lpl's have been set yet.
*/
/*
* link the CPU into the lgrp's CPU list
*/
if (my_lgrp->lgrp_cpucnt == 0) {
} else {
}
my_lgrp->lgrp_cpucnt++;
/*
* Add this cpu's chip to the per lgroup list
* if necessary
*/
if (my_lgrp->lgrp_chipcnt == 0) {
} else {
chp;
}
my_lgrp->lgrp_chipcnt++;
}
}
/*
* Create an lgroup and return it. A slot is chosen in lgrp_table[]; once
* an allocation hint has been set, the table is searched from
* lgrp_alloc_hint up to nlgrpsmax for the first slot whose lgroup does not
* exist. The chosen lgroup's latency and its child, memnode, CPU and chip
* counts are reset to zero before it is returned.
*
* NOTE(review): the declarations of my_lgrp and lgrpid, the allocation
* performed when no hint is set, the lgrp_alloc_max update, the allocation
* of a fresh lgroup and the body of the LGRP_RSRC_COUNT loop are not
* visible here -- confirm against the complete source.
*/
lgrp_t *
lgrp_create(void)
{
int i;
/*
* Find an open slot in the lgroup table and recycle unused lgroup
* left there if any
*/
if (lgrp_alloc_hint == -1)
/*
* Allocate from end when hint not set yet because no lgroups
* have been deleted yet
*/
else {
/*
* Start looking for next open slot from hint and leave hint
* at slot allocated
*/
for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
my_lgrp = lgrp_table[i];
if (!LGRP_EXISTS(my_lgrp)) {
/* Claim the first free slot and count the new lgroup. */
lgrpid = i;
nlgrps++;
break;
}
}
}
/*
* Keep track of max lgroup ID allocated so far to cut down on searches
*/
if (lgrpid > lgrp_alloc_max)
/*
* Need to allocate new lgroup if next open slot didn't have one
* for recycling
*/
/* Reset the fields of the lgroup being handed out. */
my_lgrp->lgrp_latency = 0;
my_lgrp->lgrp_childcnt = 0;
my_lgrp->lgrp_nmnodes = 0;
for (i = 0; i < LGRP_RSRC_COUNT; i++)
my_lgrp->lgrp_cpucnt = 0;
my_lgrp->lgrp_chipcnt = 0;
return (my_lgrp);
}
void
{
int i;
/*
* Unless this lgroup is being destroyed on behalf of
* the boot CPU, cpu_lock must be held
*/
if (nlgrps == 1)
if (!LGRP_EXISTS(lgrp))
return;
/*
* Set hint to lgroup being deleted and try to keep lower numbered
* hints to facilitate finding empty slots
*/
/*
* Mark this lgroup to be recycled by setting its lgroup ID to
* LGRP_NONE and clear relevant fields
*/
lgrp->lgrp_latency = 0;
lgrp->lgrp_childcnt = 0;
for (i = 0; i < LGRP_RSRC_COUNT; i++)
lgrp->lgrp_nmnodes = 0;
lgrp->lgrp_cpucnt = 0;
lgrp->lgrp_chipcnt = 0;
nlgrps--;
}
/*
* Initialize kstat data. Called from lgrp initialization code.
*
* NOTE(review): no statements are visible in this body here.
*/
static void
lgrp_kstat_init(void)
{
}
/*
* initialize an lgrp's kstats if needed
* called with cpu_lock held but not with cpus paused.
* we don't tear these down now because we don't know about
* memory leaving the lgrp yet...
*/
void
{
return; /* already initialized */
if (lgrp_kstat != NULL) {
}
}
/*
* this will do something when we manage to remove now unused lgrps
*/
/* ARGSUSED */
void
{
}
/*
* Called when a CPU is off-lined.
*/
static void
{
/*
* just because I'm paranoid doesn't mean...
*/
my_lgrp->lgrp_cpucnt--;
/*
* If the last CPU on its chip is being offlined
* then remove this chip from the per lgroup list.
*
* This is also done for the boot CPU when it needs
* to move between lgroups as a consequence of
* null proc lpa.
*/
if (--my_lgrp->lgrp_chipcnt == 0)
/*
* Walk this lgroup's chip list looking for chips that
* may try to balance against the one that's leaving
*/
}
}
/*
* Removing last CPU in lgroup, so update lgroup topology
*/
if (my_lgrp->lgrp_cpucnt == 0) {
int count;
int i;
/*
* Remove this lgroup from its lgroup CPU resources and remove
* lgroup from lgroup topology if it doesn't have any more
* resources in it now
*/
count = 0;
return;
}
/*
* This lgroup isn't empty, so just remove it from CPU
* resources of any lgroups that contain it as such
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
if (!LGRP_EXISTS(lgrp) ||
lgrpid))
continue;
}
return;
}
}
/*
* Update memory nodes in target lgroups and return ones that get changed
*/
int
{
int count;
int i;
int j;
count = 0;
if (changed)
if (klgrpset_isempty(target))
return (0);
/*
* Find each lgroup in target lgroups
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
/*
* Skip any lgroups that don't exist or aren't in target group
*/
lgrp = lgrp_table[i];
continue;
}
/*
* Initialize memnodes for intermediate lgroups to 0
* and update them from scratch since they may have completely
* changed
*/
lgrp->lgrp_nmnodes = 0;
}
/*
* Update memory nodes of target lgroup with memory nodes
* from each lgroup in its lgroup memory resource set
*/
for (j = 0; j <= lgrp_alloc_max; j++) {
int k;
/*
* Skip any lgroups that don't exist or aren't in
* memory resources of target lgroup
*/
lgrp_rsrc = lgrp_table[j];
if (!LGRP_EXISTS(lgrp_rsrc) ||
j))
continue;
/*
* Update target lgroup's memnodes to include memnodes
* of this lgroup
*/
for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
lgrp->lgrp_nmnodes++;
}
}
count++;
if (changed)
}
}
return (count);
}
/*
* Memory copy-rename. Called when the "mnode" containing the kernel cage memory
* is moved from one board to another. The "from" and "to" arguments specify the
* source and the destination of the move.
*
* See plat_lgrp_config() for a detailed description of the copy-rename
* semantics.
*
* The lgrp_mem_rename() is called by the platform copy-rename code to update
* the lgroup topology which is changing as memory moves from one lgroup to
* another. It removes the mnode from the source lgroup and re-inserts it in the
* target lgroup.
*
* The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
* lgrp_mem_fini() telling that the insertion and deletion are part of a DR
* copy-rename operation.
*
* There is one case which requires special handling. If the system contains
* only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
* lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
* lgrp_mem_init(), but there is a window when the system has no memory in the
* lgroup hierarchy. If another thread tries to allocate memory during this
* window, the allocation will fail, although the system has physical memory.
* This may cause a system panic or a deadlock (some sleeping memory allocations
* happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
* the mnode back).
*
* The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
* lgrp with non-empty lgrp_mnodes. To deal with the special case above,
* lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
* but it updates the rest of the lgroup topology as if the mnode was actually
* removed. The lgrp_mem_init() function recognizes that the mnode being
* inserted represents such a special case and updates the topology
* appropriately.
*/
void
{
/*
* Remove the memory from the source node and add it to the destination
* node.
*/
}
/*
* Called to indicate that the lgrp with platform handle "hand" now
* contains the memory identified by "mnode".
*
* LOCKING for this routine is a bit tricky. Usually it is called without
* cpu_lock and it must grab cpu_lock here to prevent racing with other
* callers. During DR of the board containing the caged memory it may be called
* with cpu_lock already held and CPUs paused.
*
* If the insertion is part of the DR copy-rename and the inserted mnode (and
* only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
* dealing with the special case of DR copy-rename described in
* lgrp_mem_rename().
*/
void
{
int count;
int i;
/*
* Grab CPU lock (if we haven't already)
*/
if (!MUTEX_HELD(&cpu_lock)) {
}
/*
* This routine may be called from a context where we already
* hold cpu_lock, and have already paused cpus.
*/
if (!cpus_paused())
need_synch = B_TRUE;
/*
* Check if this mnode is already configured and return immediately if
* it is.
*
* NOTE: in special case of copy-rename of the only remaining mnode,
* lgrp_mem_fini() refuses to remove the last mnode from the root, so we
* recognize this case and continue as usual, but skip the update to
* the lgrp_mnodes and the lgrp_nmnodes. This repairs the inconsistency
* in the topology that lgrp_mem_fini() temporarily introduced.
*/
if (drop_lock)
return;
}
/*
* Update lgroup topology with new memory resources, keeping track of
* which lgroups change
*/
count = 0;
/* new lgrp */
my_lgrp = lgrp_create();
if (need_synch)
&changed);
if (need_synch)
start_cpus();
> 0) {
/*
* Leaf lgroup was created, but latency wasn't available
* then. So, set latency for it and fill in rest of lgroup
* topology now that we know how far it is from other leaf
* lgroups.
*/
lgrpid))
if (need_synch)
&changed);
if (need_synch)
start_cpus();
/*
* Add new lgroup memory resource to existing lgroup
*/
count++;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
if (!LGRP_EXISTS(lgrp) ||
continue;
count++;
}
}
/*
* Add memory node to lgroup and remove lgroup from ones that need
* to be updated
*/
my_lgrp->lgrp_nmnodes++;
}
/*
* Update memory node information for all lgroups that changed and
* contain new memory node as a resource
*/
if (count)
if (drop_lock)
}
/*
* Called to indicate that the lgroup associated with the platform
* handle "hand" no longer contains given memory node
*
* LOCKING for this routine is a bit tricky. Usually it is called without
* cpu_lock and it must grab cpu_lock here to prevent racing with other
* callers. During DR of the board containing the caged memory it may be called
* with cpu_lock already held and CPUs paused.
*
* If the deletion is part of the DR copy-rename and the deleted mnode is the
* only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
* but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
* the same mnode back into the topology. See lgrp_mem_rename() and
* lgrp_mem_init() for additional details.
*/
void
{
int count;
int i;
/*
* Grab CPU lock (if we haven't already)
*/
if (!MUTEX_HELD(&cpu_lock)) {
}
/*
* This routine may be called from a context where we already
* hold cpu_lock and have already paused cpus.
*/
if (!cpus_paused())
need_synch = B_TRUE;
/*
* The lgrp *must* be pre-existing
*/
/*
* Delete memory node from lgroups which contain it
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
/*
* Skip any non-existent lgroups and any lgroups that don't
* contain leaf lgroup of memory as a memory resource
*/
if (!LGRP_EXISTS(lgrp) ||
continue;
/*
* Avoid removing the last mnode from the root in the DR
* copy-rename case. See lgrp_mem_rename() for details.
*/
if (is_copy_rename &&
continue;
/*
* Remove memory node from lgroup.
*/
lgrp->lgrp_nmnodes--;
}
/*
* Don't need to update lgroup topology if this lgroup still has memory.
*
* In the special case of DR copy-rename with the only mnode being
* removed, the lgrp_mnodes for the root is always non-zero, but we
* still need to update the lgroup topology.
*/
if ((my_lgrp->lgrp_nmnodes > 0) &&
!(is_copy_rename &&
if (drop_lock)
return;
}
/*
* This lgroup does not contain any memory now
*/
/*
* Remove this lgroup from lgroup topology if it does not contain any
* resources now
*/
count = 0;
/*
* Delete lgroup when no more resources
*/
if (need_synch)
if (need_synch)
start_cpus();
} else {
/*
* Remove lgroup from memory resources of any lgroups that
* contain it as such
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
if (!LGRP_EXISTS(lgrp) ||
lgrpid))
continue;
}
}
if (drop_lock)
}
/*
* Return lgroup with given platform handle
*/
lgrp_t *
{
int i;
if (hand == LGRP_NULL_HANDLE)
return (NULL);
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
return (lgrp);
}
return (NULL);
}
/*
* Return the home lgroup of the current thread.
* We must do this with kernel preemption disabled, since we don't want our
* thread to be re-homed while we're poking around with its lpl, and the lpl
* should never be NULL.
*
* NOTE: Can't guarantee that lgroup will be valid once kernel preemption
* is enabled because of DR. Callers can disable kernel preemption
* around this call to guarantee that the lgroup will be valid beyond this
* routine, since kernel preemption can be recursive.
*
* NOTE(review): the declaration of "lgrp" and the statements that look it
* up are not visible here -- confirm against the complete source.
*/
lgrp_t *
lgrp_home_lgrp(void)
{
return (lgrp);
}
/*
* Return ID of home lgroup for given thread
* (See comments for lgrp_home_lgrp() for special care and handling
* instructions)
*/
{
/*
* We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
* cannot since the HAT layer can call into this routine to
* determine the locality for its data structures in the context
* of a page fault.
*/
return (lgrp);
}
/*
* Return lgroup containing the physical memory for the given page frame number
*/
lgrp_t *
{
int i;
if (hand != LGRP_NULL_HANDLE)
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
return (lgrp);
}
return (NULL);
}
/*
* Return lgroup containing the physical memory for the given page frame number
*/
lgrp_t *
{
int i;
if (hand != LGRP_NULL_HANDLE)
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
return (lgrp);
}
return (NULL);
}
/*
* Return the leaf lgroup containing the given CPU
*
* The caller needs to take precautions necessary to prevent
* "cpu" from going away across a call to this function.
* hint: kpreempt_disable()/kpreempt_enable()
*/
static lgrp_t *
{
}
/*
* Return the sum of the partition loads in an lgrp divided by
* the number of CPUs in the lgrp. This is our best approximation
* of an 'lgroup load average' for a useful per-lgroup kstat.
*/
static uint64_t
{
int ncpu;
return (0ull);
}
do {
}
void
{
struct lgrp_stats *pstats;
/*
* Verify that the caller isn't trying to add to
* a statistic for an lgroup that has gone away
*/
return;
}
{
struct lgrp_stats *pstats;
return ((int64_t)0);
return (val);
}
/*
* Reset all kstats for lgrp specified by its lgrpid.
*/
static void
{
return;
}
}
/*
* Collect all per-lgrp statistics for the lgrp associated with this
* kstat, and store them in the ks_data array.
*
* The superuser can reset all the running counter statistics for an
* lgrp by writing to any of the lgrp's stats.
*/
static int
{
struct kstat_named *ksd;
/*
* Return all zeroes as stats for freed lgrp.
*/
}
} else if (rw != KSTAT_WRITE) {
/*
* Handle counter stats
*/
}
/*
* Handle kernel data snapshot stats
*/
} else {
}
return (0);
}
int
{
return (EINVAL);
}
return (EINVAL);
}
return (0);
}
int
{
return (EINVAL);
}
return (0);
}
void
{
int i;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
}
}
/*
* Add a resource named by lpl_leaf to rset of lpl_target
*
* This routine also adjusts ncpu and nrset if the call succeeds in adding a
* resource. It is adjusted here, as this is presently the only place that we
* can be certain a resource addition has succeeded.
*
* We keep the list of rsets sorted so that the dispatcher can quickly walk the
* list in order until it reaches a NULL. (This list is required to be NULL
* terminated, too). This is done so that we can mark start pos + 1, so that
* each lpl is traversed sequentially, but in a different order. We hope this
* will improve performance a bit. (Hopefully, less read-to-own traffic...)
*/
void
{
int i;
int entry_slot = 0;
/* return if leaf is already present */
for (i = 0; i < lpl_target->lpl_nrset; i++) {
return;
}
lpl_leaf->lpl_lgrpid) {
break;
}
}
/* insert leaf, update counts */
entry_slot = i;
i = lpl_target->lpl_nrset++;
panic("More leaf lgrps in system than are supported!\n");
}
/*
* Start at the end of the rset array and work backwards towards the
* slot into which the new lpl will be inserted. This effectively
* preserves the current ordering by scooting everybody over one entry,
* and placing the new entry into the space created.
*/
while (i-- > entry_slot) {
}
}
/*
* Update each of lpl_parent's children with a proper hint and
* a reference to their parent.
* The lgrp topology is used as the reference since it is fully
* consistent and correct at this point.
*
* Each child's hint will reference an element in lpl_parent's
* rset that designates where the child should start searching
* for CPU resources. The hint selected is the highest order leaf present
* in the child's lineage.
*
* This should be called after any potential change in lpl_parent's
* rset.
*/
static void
{
int hint;
int i, j;
if (klgrpset_isempty(children))
return; /* nothing to do */
for (i = 0; i <= lgrp_alloc_max; i++) {
if (klgrpset_ismember(children, i)) {
/*
* Given the set of leaves in this child's lineage,
* find the highest order leaf present in the parent's
* rset. Select this as the hint for the child.
*/
hint = 0;
for (j = 0; j < lpl_parent->lpl_nrset; j++) {
hint = j;
}
/*
* (Re)set the parent. It may be incorrect if
* lpl_parent is new in the topology.
*/
}
}
}
/*
* Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
*
* This routine also adjusts ncpu and nrset if the call succeeds in deleting a
* resource. The values are adjusted here, as this is the only place that we can
* be certain a resource was successfully deleted.
*/
void
{
int i;
/* find leaf in intermediate node */
for (i = 0; i < lpl_target->lpl_nrset; i++) {
break;
}
/* return if leaf not found */
return;
/* prune leaf, compress array */
lpl_target->lpl_ncpu--;
do {
} while (i++ < lpl_target->lpl_nrset);
}
/*
* Check to see if the resource set of the target lpl contains the
* supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
*/
int
{
int i;
for (i = 0; i < lpl_target->lpl_nrset; i++) {
return (1);
}
return (0);
}
/*
* Called when we change cpu lpl membership. This increments or decrements the
* per-cpu counter in every lpl in which our leaf appears.
*/
void
{
int i;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp_cur = lgrp_table[i];
/*
* Don't adjust if the lgrp isn't there, if we're the leaf lpl
* for the cpu in question, or if the current lgrp and leaf
* don't share the same resources.
*/
continue;
if (act == LPL_INCREMENT) {
} else if (act == LPL_DECREMENT) {
}
}
}
}
/*
* Initialize lpl with given resources and specified lgrp
*/
void
{
lpl->lpl_loadavg = 0;
else
}
/*
* Clear an unused lpl
*/
void
{
/* save lid for debugging purposes */
}
/*
* Given a CPU-partition, verify that the lpl topology in the CPU-partition
* is in sync with the lgroup topology in the system. The lpl topology may not
* make full use of all of the lgroup topology, but this checks to make sure
* that for the parts that it does use, it has correctly understood the
* relationships that exist. This function returns 0 if the topology is
* correct and, on non-debug kernels, a non-zero error code if it is
* incorrect. Asserts are spread throughout the code to aid in
* debugging on a DEBUG kernel.
*/
int
{
int i;
int j;
int sum;
/* topology can't be incorrect if it doesn't exist */
if (!lgrp_topo_initialized || !lgrp_initialized)
return (LPL_TOPO_CORRECT);
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp = lgrp_table[i];
/* make sure lpls are allocated */
if (!cpupart->cp_lgrploads)
return (LPL_TOPO_PART_HAS_NO_LPL);
/* make sure our index is good */
/* if lgroup doesn't exist, make sure lpl is empty */
if (!LGRP_EXISTS(lgrp)) {
return (LPL_TOPO_CPUS_NOT_EMPTY);
} else {
continue;
}
}
/* verify that lgroup and lpl are identically numbered */
/* if lgroup isn't in our partition, make sure lpl is empty */
cpupart->cp_lgrpset)) {
return (LPL_TOPO_CPUS_NOT_EMPTY);
}
/*
* lpl is empty, and lgroup isn't in partition. verify
* that lpl doesn't show up in anyone else's rsets (in
* this partition, anyway)
*/
for (j = 0; j < cpupart->cp_nlgrploads; j++) {
return (LPL_TOPO_LPL_ORPHANED);
}
}
/* lgroup is empty, and everything is ok. continue */
continue;
}
/* lgroup is in this partition, now check it against lpl */
/* do both have matching lgrps? */
return (LPL_TOPO_LGRP_MISMATCH);
}
/* do the parent lgroups exist and do they match? */
if (lgrp->lgrp_parent) {
if (!lpl->lpl_parent) {
return (LPL_TOPO_MISSING_PARENT);
return (LPL_TOPO_PARENT_MISMATCH);
}
}
/* only leaf lgroups keep a cpucnt, only check leaves */
/* verify that lgrp is also a leaf */
lpl->lpl_lgrpid)));
if ((lgrp->lgrp_childcnt > 0) ||
lpl->lpl_lgrpid))) {
return (LPL_TOPO_LGRP_NOT_LEAF);
}
return (LPL_TOPO_BAD_CPUCNT);
}
/*
* Check that lpl_ncpu also matches the number of
* cpus in the lpl's linked list. This only exists in
* leaves, but they should always match.
*/
j = 0;
j++;
/* check to make sure cpu's lpl is leaf lpl */
return (LPL_TOPO_CPU_HAS_BAD_LPL);
}
/* check next cpu */
continue;
} else {
}
}
return (LPL_TOPO_LPL_BAD_NCPU);
}
/*
* Also, check that leaf lpl is contained in all
* intermediate lpls that name the leaf as a descendant
*/
for (j = 0; j <= lgrp_alloc_max; j++) {
lgrp_cand = lgrp_table[j];
if (!LGRP_EXISTS(lgrp_cand) ||
cpupart->cp_lgrpset) ||
(intersect == 0))
continue;
lpl_cand =
lpl));
return (LPL_TOPO_RSET_MSSNG_LF);
}
}
}
} else { /* non-leaf specific checks */
/*
* Non-leaf lpls should have lpl_cpus == NULL
* verify that this is so
*/
return (LPL_TOPO_NONLEAF_HAS_CPUS);
}
/*
* verify that the sum of the cpus in the leaf resources
* is equal to the total ncpu in the intermediate
*/
}
return (LPL_TOPO_LPL_BAD_NCPU);
}
}
/*
* check on lpl_hint. Don't check root, since it has no parent.
*/
int hint;
/* make sure hint is within limits of nrset */
return (LPL_TOPO_BOGUS_HINT);
}
/* make sure hint points to valid lpl */
return (LPL_TOPO_BOGUS_HINT);
}
}
/*
* Check the rset of the lpl in question. Make sure that each
* rset contains a subset of the resources in
* lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
* sure that each rset doesn't include resources that are
* outside of that set. (Which would be resources somehow not
* accounted for).
*/
}
/* make sure lpl rset matches lgrp rset */
/* make sure rset is contained within partition, too */
if (!klgrpset_isempty(rset) ||
!klgrpset_isempty(cset)) {
return (LPL_TOPO_RSET_MISMATCH);
}
/*
* check to make sure lpl_nrset matches the number of rsets
* contained in the lpl
*/
j++);
return (LPL_TOPO_BAD_RSETCNT);
}
}
return (LPL_TOPO_CORRECT);
}
/*
* Flatten lpl topology to given number of levels. This is presently only
* implemented for a flatten to 2 levels, which will prune out the intermediates
* and home the leaf lpls to the root lpl.
*/
int
lpl_topo_flatten(int levels)
{
/* index used to walk lgrp_table[] for each partition visited */
int i;
/*
* Only a flatten to exactly 2 levels is supported. Any other request
* returns 0 before anything is examined or modified.
*/
if (levels != 2)
return (0);
/* called w/ cpus paused - grab no locks! */
/* start at cp_list_head; the outer loop ends once cp is back at it */
cp = cp_list_head;
do {
/* visit every lgrp_table slot from 0 through lgrp_alloc_max */
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp_cur = lgrp_table[i];
(!LGRP_EXISTS(lgrp_cur) &&
continue;
/*
* this should be a deleted intermediate, so
* clear it
*/
/*
* this is a leaf whose parent was deleted, or
* whose parent had their lgrp deleted. (And
* whose parent will soon be deleted). Point
* this guy back to the root lpl.
*/
}
}
/*
* Now that we're done, make sure the count on the root lpl is
* correct, and update the hints of the children for the sake of
* thoroughness
*/
}
} while (cp != cp_list_head);
/* levels is necessarily 2 here; hand it back to signal the flatten ran */
return (levels);
}
/*
* Insert a lpl into the resource hierarchy and create any additional lpls that
* are necessary to represent the varying states of locality for the cpu
* resources newly added to the partition.
*
* This routine is clever enough that it can correctly add resources from the
* new leaf into both direct and indirect resource sets in the hierarchy. (Ie,
* those for which the lpl is a leaf as opposed to simply a named equally local
* resource). The one special case that needs additional processing is when a
* new intermediate lpl is introduced. Since the main loop only traverses
* looking to add the leaf resource where it does not yet exist, additional work
* is necessary to add other leaf resources that may need to exist in the newly
* created intermediate. This is performed by the second inner loop, and is
* only done when the check for more than one overlapping resource succeeds.
*/
void
{
int i;
int j;
int hint;
int rset_num_intersect;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp_cur = lgrp_table[i];
/*
* Don't insert if the lgrp isn't there, if the leaf isn't
* contained within the current lgrp, or if the current lgrp has
* no leaves in this partition
*/
if (!LGRP_EXISTS(lgrp_cur) ||
lpl_leaf->lpl_lgrpid) ||
continue;
/* if lgrp has a parent, assign it properly */
} else {
/* if not, make sure parent ptr gets set to null */
lpl_parent = NULL;
}
/*
* Almost all leaf state was initialized elsewhere. The
* only thing left to do is to set the parent.
*/
continue;
}
/*
* Initialize intermediate lpl
* Save this lpl's hint though. Since we're changing this
* lpl's resources, we need to update the hint in this lpl's
* children, but the hint in this lpl is unaffected and
* should be preserved.
*/
/* does new lpl need to be populated with other resources? */
if (rset_num_intersect > 1) {
/*
* If so, figure out what lpls have resources that
* intersect this one, and add them.
*/
for (j = 0; j <= lgrp_alloc_max; j++) {
lgrp_cand = lgrp_table[j];
if (!LGRP_EXISTS(lgrp_cand) ||
continue;
lpl_cand =
}
}
/*
* This lpl's rset has changed. Update the hint in its
* children.
*/
}
}
/*
* remove a lpl from the hierarchy of resources, clearing its state when
* finished. If the lpls at the intermediate levels of the hierarchy have no
* remaining resources, or no longer name a leaf resource in the cpu-partition,
* delete them as well.
*/
void
{
int i;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp_cur = lgrp_table[i];
/*
* Don't attempt to remove from lgrps that aren't there, that
* don't contain our leaf, or from the leaf itself. (We do that
* later)
*/
if (!LGRP_EXISTS(lgrp_cur))
continue;
lpl_leaf->lpl_lgrpid) ||
continue;
}
/*
* This is a slightly sleazy simplification in that we have
* already marked the cp_lgrpset as no longer containing the
* leaf we've deleted. Any lpls that pass the above checks
* based upon lgrp membership but not necessarily cpu-part
* membership also get cleared by the checks below. Currently
* this is harmless, as the lpls should be empty anyway.
*
* In particular, we want to preserve lpls that have additional
* leaf resources, even though we don't yet have a processor
* architecture that represents resources this way.
*/
} else {
/*
* Update this lpl's children
*/
}
}
}
/*
* add a cpu to a partition in terms of lgrp load avg bookkeeping
*
* The lpl (cpu partition load average information) is now arranged in a
* hierarchical fashion whereby resources that are closest, ie. most local, to
* the cpu in question are considered to be leaves in a tree of resources.
* There are two general cases for cpu addition:
*
* 1. A lpl structure that contains resources already in the hierarchy tree.
* In this case, all of the associated lpl relationships have been defined, and
* all that is necessary is that we link the new cpu into the per-lpl list of
* cpus, and increment the ncpu count of all places where this cpu resource will
* be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
* pushing is accomplished by this routine.
*
* 2. The lpl to contain the resources in this cpu-partition for this lgrp does
* not exist yet. In this case, it is necessary to build the leaf lpl, and
* construct the hierarchy of state necessary to name its more distant
* resources, if they should exist. The leaf structure is initialized by this
* routine, as is the cpu-partition state for the lgrp membership. This routine
* also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
* and builds all of the "ancestral" state necessary to identify resources at
* differing levels of locality.
*/
void
{
/* called sometimes w/ cpus paused - grab no locks */
/* don't add non-existent lgrp */
/* only leaf lpls contain cpus */
} else {
/*
* the lpl should already exist in the parent, so just update
* the count of available CPUs
*/
}
/* link cpu into list of cpus in lpl */
} else {
/*
* We increment ncpu immediately after we create a new leaf
* lpl, so assert that ncpu == 1 for the case where we don't
* have any cpu pointers yet.
*/
}
}
/*
* remove a cpu from a partition in terms of lgrp load avg bookkeeping
*
* The lpl (cpu partition load average information) is now arranged in a
* hierarchical fashion whereby resources that are closest, ie. most local, to
* the cpu in question are considered to be leaves in a tree of resources.
* There are two removal cases in question:
*
* 1. Removal of the resource in the leaf leaves other resources remaining in
* that leaf. (Another cpu still exists at this level of locality). In this
* case, the count of available cpus is decremented in all associated lpls by
* calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
* from the per-cpu lpl list.
*
* 2. Removal of the resource results in the lpl containing no resources. (It's
* empty) In this case, all of what has occurred for the first step must take
* place; however, additionally we must remove the lpl structure itself, prune
* out any stranded lpls that do not directly name a leaf resource, and mark the
* cpu partition in question as no longer containing resources from the lgrp of
* the lpl that has been deleted. Cpu-partition changes are handled by this
* method, but the lpl_leaf_remove function deals with the details of pruning
* out the empty lpl and any of its orphaned direct ancestors.
*/
void
{
/* called sometimes w/ cpus paused - grab no locks */
/* don't delete a leaf that isn't there */
/* no double-deletes */
/*
* This was the last cpu in this lgroup for this partition,
* clear its bit in the partition's lgroup bitmask
*/
/* eliminate remaining lpl link pointers in cpu, lpl */
} else {
/* unlink cpu from lists of cpus in lpl */
}
/*
* Update the cpu count in the lpls associated with parent
* lgroups.
*/
}
/* clear cpu's lpl ptr when we're all done */
}
/*
*
* We rely on the fact that this routine is called from the clock thread
* at a point before the clock thread can block (i.e. before its first
* lock request). Since the clock thread can not be preempted (since it
* runs at highest priority), we know that cpu partitions can not change
* (since doing so would require either the repartition requester or the
* cpu_pause thread to run on this cpu), so we can update the cpu's load
* without grabbing cpu_lock.
*/
void
{
/*
* 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
*/
static short expval[] = {
0, 3196, 1618, 1083,
814, 652, 543, 466,
408, 363, 326, 297,
272, 251, 233, 218,
204, 192, 181, 172,
163, 155, 148, 142,
136, 130, 125, 121,
116, 112, 109, 105
};
/* ASSERT (called from clock level) */
return;
}
for (;;) {
else
/*
* Modify the load average atomically to avoid losing
* anticipatory load updates (see lgrp_move_thread()).
*/
if (ageflag) {
/*
* We're supposed to both update and age the load.
* little hoop-jumping to avoid integer overflow.
*/
int64_t q, r;
do {
((r * f) >> 16)) >> 7;
/*
* Check for overflow
*/
if (new > LGRP_LOADAVG_MAX)
else if (new < 0)
new = 0;
} else {
/*
* We're supposed to update the load, but not age it.
* This option is used to update the load (which either
* has already been aged in this 1/10 sec. interval or
* soon will be) to account for a remotely executing
* thread.
*/
do {
new += f;
/*
* Check for overflow
* Underflow not possible here
*/
}
/*
* Do the same for this lpl's parent
*/
break;
}
}
/*
* Initialize lpl topology in the target based on topology currently present in
* lpl_bootstrap.
*
* lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
* initialize cp_default list of lpls. Up to this point all topology operations
* were performed using lpl_bootstrap. Now cp_default has its own list of lpls
* and all subsequent lpl operations should use it instead of lpl_bootstrap. The
* `target' points to the list of lpls in cp_default and `size' is the size of
* this list.
*
* This function walks the lpl topology in lpl_bootstrap and does four things:
*
* 1) Copies all fields from lpl_bootstrap to the target.
*
* 2) Sets CPU0 lpl pointer to the correct element of the target list.
*
* 3) Updates lpl_parent pointers to point to the lpls in the target list
* instead of lpl_bootstrap.
*
* 4) Updates pointers in the resource list of the target to point to the lpls
* in the target list instead of lpl_bootstrap.
*
* After lpl_topo_bootstrap() completes, target contains the same information
* that would be present there if it were used during boot instead of
* lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
* this, so it is bzeroed.
*/
void
{
int howmany;
int id;
int i;
/*
* The only target that should be passed here is cp_default lpl list.
*/
/*
* Copy all fields from lpl.
*/
*target_lpl = *lpl;
/*
* Substitute CPU0 lpl pointer with one relative to target.
*/
}
/*
* Substitute parent information with parent relative to target.
*/
/*
* Walk over resource set substituting pointers relative to
* lpl_bootstrap to pointers relative to target.
*/
(lpl_t *)
}
}
}
/*
* Topology information in lpl_bootstrap is no longer needed.
*/
}
/* the maximum effect that a single thread can have on its lgroup's load */
/* per-cpu share of lgrp_loadavg_max_effect for an lgroup holding ncpu cpus */
#define LGRP_LOADAVG_MAX_EFFECT(ncpu) ((lgrp_loadavg_max_effect) / (ncpu))
/*
* If the lowest load among the lgroups a process' threads are currently
* spread across is greater than lgrp_expand_proc_thresh, we'll consider
* expanding the process to a new lgroup.
*/
/* default for the expansion threshold, and the threshold scaled per cpu */
#define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
#define LGRP_EXPAND_PROC_THRESH(ncpu) ((lgrp_expand_proc_thresh) / (ncpu))
/*
* A process will be expanded to a new lgroup only if the difference between
* the lowest load on the lgroups the process' threads are currently spread
* across and the lowest load on the other lgroups in the process' partition
* is greater than lgrp_expand_proc_diff.
*/
/* default for the expansion load difference, and the difference scaled per cpu */
#define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
#define LGRP_EXPAND_PROC_DIFF(ncpu) ((lgrp_expand_proc_diff) / (ncpu))
/*
* The loadavg tolerance accounts for "noise" inherent in the load, which may
* be present due to impreciseness of the load average decay algorithm.
*
* The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
* tolerance is scaled by the number of cpus in the lgroup just like
* lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
* and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
* of: 0x10000 / 4 => 0x4000 or greater to be significant.
*/
/*
* Scale the lgrp_loadavg_tolerance tunable by the number of cpus in the
* lgroup. The ncpu argument is parenthesized so that an expression argument
* (e.g. a + b) is used as the whole divisor rather than being split by
* operator precedence.
*/
#define LGRP_LOADAVG_TOLERANCE(ncpu) \
((lgrp_loadavg_tolerance) / (ncpu))
/*
* lgrp_choose() will choose root lgroup as home when lowest lgroup load
* average is above this threshold
*/
/*
* lgrp_choose() will try to skip any lgroups with less memory
* than this free when choosing a home lgroup
*/
/*
* When choosing between similarly loaded lgroups, lgrp_choose() will pick
* one based on one of the following policies:
* - Random selection
* - Pseudo round robin placement
* - Longest time since a thread was last placed
*/
/* values compared against lgrp_choose_policy to select the tie-break policy */
#define LGRP_CHOOSE_RANDOM 1
#define LGRP_CHOOSE_RR 2
#define LGRP_CHOOSE_TIME 3
/*
* Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to
* be bound to a CPU or processor set.
*
* Arguments:
* t The thread
* cpupart The partition the thread belongs to.
*
* NOTE: Should at least be called with the cpu_lock held, kernel preemption
* disabled, or thread_lock held (at splhigh) to protect against the CPU
* partitions changing out from under us and assumes that given thread is
* protected. Also, called sometimes w/ cpus paused or kernel preemption
* disabled, so don't grab any locks because we should never block under
* those conditions.
*/
lpl_t *
{
int lgrpid_offset, lgrp_count;
proc_t *p;
THREAD_LOCK_HELD(t));
p = t->t_procp;
/* A process should always be in an active partition */
switch (lgrp_choose_policy) {
case LGRP_CHOOSE_RR:
do {
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0;
break;
default:
case LGRP_CHOOSE_TIME:
case LGRP_CHOOSE_RANDOM:
if (--lgrpid_offset == 0)
break;
}
}
break;
}
/*
* Use lgroup affinities (if any) to choose best lgroup
*
* NOTE: Assumes that thread is protected from going away and its
* lgroup affinities won't change (ie. p_lock, or
*/
if (t->t_lgrp_affinity) {
return (lpl);
}
do {
/*
* Skip any lgroups outside of thread's pset
*/
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0; /* wrap the search */
continue;
}
/*
* Skip any non-leaf lgroups
*/
continue;
/*
* Skip any lgroups without enough free memory
* (when threshold set to nonzero positive value)
*/
if (lgrp_mem_free_thresh > 0) {
if (npgs < lgrp_mem_free_thresh) {
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0; /* wrap the search */
continue;
}
}
if (klgrpset_isempty(p->p_lgrpset) ||
/*
* Either this is a new process or the process already
* has threads on this lgrp, so this is a preferred
* lgroup for the thread.
*/
}
} else {
/*
* The process doesn't have any threads on this lgrp,
* but we're willing to consider this lgrp if the load
* difference is big enough to justify splitting up
* the process' threads.
*/
}
}
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0; /* wrap the search */
} while (lgrpid != lgrpid_start);
/*
* Return root lgroup if threshold isn't set to maximum value and
* lowest lgroup load average more than a certain threshold
*/
if (lgrp_load_thresh != UINT32_MAX &&
/*
* If all the lgroups over which the thread's process is spread are
* heavily loaded, or otherwise undesirable, we'll consider placing
* the thread on one of the other leaf lgroups in the thread's
* partition.
*/
bestload))) {
}
/*
* No lgroup looked particularly good, but we still
* have to pick something. Go with the randomly selected
* legal lgroup we started with above.
*/
}
return (bestlpl);
}
/*
* Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
* Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
*/
static int
{
/* lpl1 is significantly less loaded than lpl2 */
return (1);
}
if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
/*
* lpl1's load is within the tolerance of lpl2. We're
* willing to consider it to be better, however, if
* it has been longer since we last homed a thread there
*/
return (1);
}
return (0);
}
/*
* An LWP is expected to be assigned to an lgroup for at least this long
* for its anticipatory load to be justified. NOTE that this value should
* not be set extremely huge (say, larger than 100 years), to avoid problems
* with overflow in the calculation that uses it.
*/
/*
* Routine to change a thread's lgroup affiliation. This routine updates
* the thread's kthread_t struct and its process' proc_t struct to note the
* thread's new lgroup affiliation, and its lgroup affinities.
*
* Note that this is the only routine that modifies a thread's t_lpl field,
* and that adds in or removes anticipatory load.
*
* If the thread is exiting, newlpl is NULL.
*
* Locking:
* The following lock must be held on entry:
* cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
* doesn't get removed from t's partition
*
* This routine is not allowed to grab any locks, since it may be called
* with cpus paused (such as from cpu_offline).
*/
void
{
proc_t *p;
ASSERT(t);
THREAD_LOCK_HELD(t));
/*
* If not changing lpls, just return
*/
return;
/*
* Make sure the thread's lwp hasn't exited (if so, this thread is now
* associated with process 0 rather than with its original process).
*/
if (t->t_proc_flag & TP_LWPEXIT) {
}
return;
}
p = ttoproc(t);
/*
* If the thread had a previous lgroup, update its process' p_lgrpset
* to account for it being moved from its old lgroup.
*/
if ((do_lgrpset_delete) &&
/*
* Check if a thread other than the thread
* that's moving is assigned to the same
* lgroup as the thread that's moving. Note
* that we have to compare lgroup IDs, rather
* than simply comparing t_lpl's, since the
* threads may belong to different partitions
* but be assigned to the same lgroup.
*/
if ((tp != t) &&
/*
* Another thread is assigned to the
* same lgroup as the thread that's
* moving, p_lgrpset doesn't change.
*/
break;
/*
* No other thread is assigned to the
* same lgroup as the exiting thread,
* clear the lgroup's bit in p_lgrpset.
*/
break;
}
}
}
/*
* If this thread was assigned to its old lgroup for such a
* short amount of time that the anticipatory load that was
* added on its behalf has aged very little, remove that
* anticipatory load.
*/
for (;;) {
do {
/*
* this can happen if the load
* average was aged since we
* added in the anticipatory
* load
*/
new = 0;
}
} while (cas32(
break;
}
}
}
/*
* If the thread has a new lgroup (i.e. it's not exiting), update its
* t_lpl and its process' p_lgrpset, and apply an anticipatory load
* to its new lgroup to account for its move to its new lgroup.
*/
/*
* This thread is moving to a new lgroup
*/
/*
* Reflect move in load average of new lgroup
* unless it is root lgroup
*/
return;
}
/*
* It'll take some time for the load on the new lgroup
* to reflect this thread's placement on it. We'd
* like not, however, to have all threads between now
* and then also piling on to this lgroup. To avoid
* this pileup, we anticipate the load this thread
* will generate on its new lgroup. The goal is to
* make the lgroup's load appear as though the thread
* had been there all along. We're very conservative
* in calculating this anticipatory load, we assume
* the worst case (100% CPU-bound thread). This
* may be modified in the future to be more accurate.
*/
for (;;) {
do {
/*
* Check for overflow
* Underflow not possible here
*/
new = UINT32_MAX;
break;
}
}
}
/*
* Return lgroup memory allocation policy given advice from madvise(3C)
*/
{
switch (advice) {
case MADV_ACCESS_LWP:
return (LGRP_MEM_POLICY_NEXT);
case MADV_ACCESS_MANY:
return (LGRP_MEM_POLICY_RANDOM);
default:
}
}
/*
* Figure out default policy
*/
{
/*
* Randomly allocate memory across lgroups for shared memory
* beyond a certain threshold
*/
/*
* Get total memory size of current thread's pset
*/
/*
* Choose policy to randomly allocate memory across
* lgroups in pset if it will fit and is not default
* partition. Otherwise, allocate memory randomly
* across machine.
*/
else
} else
/*
* Apply default policy for private memory and
* shared memory under the respective random
* threshold.
*/
return (policy);
}
/*
* Get memory allocation policy for this segment
*/
{
extern struct seg_ops segspt_ops;
extern struct seg_ops segspt_shmops;
/*
* This is for binary compatibility to protect against third party
* segment drivers which haven't recompiled to allow for
* SEGOP_GETPOLICY()
*/
return (NULL);
policy_info = NULL;
return (policy_info);
}
/*
* Set policy for allocating private memory given desired policy, policy info,
* size in bytes of memory that policy is being applied.
* Return 0 if policy wasn't set already and 1 if policy was set already
*/
int
{
if (policy == LGRP_MEM_POLICY_DEFAULT)
/*
* Policy set already?
*/
return (1);
/*
* Set policy
*/
policy_info->mem_reserved = 0;
return (0);
}
/*
* Get shared memory allocation policy with given tree and offset
*/
{
/*
* Get policy segment tree from anon_map or vnode and use specified
* anon index or vnode offset as offset
*
* Assume that no lock needs to be held on anon_map or vnode, since
* they should be protected by their reference count which must be
* nonzero for an existing segment
*/
if (amp) {
if (shm_locality == NULL)
return (NULL);
} else if (vp) {
if (shm_locality == NULL)
return (NULL);
}
return (NULL);
/*
* Lookup policy segment for offset into shared object and return
* policy info
*/
policy_info = NULL;
if (policy_seg)
return (policy_info);
}
/*
* Default memory allocation policy for kernel segmap pages
*/
/*
* Return lgroup to use for allocating memory
* given the segment and address
*
* There isn't any mutual exclusion that exists between calls
* to this routine and DR, so this routine and whomever calls it
* should be mindful of the possibility that the lgrp returned
* may be deleted. If this happens, dereferences of the lgrp
* pointer will still be safe, but the resources in the lgrp will
* be gone, and LGRP_EXISTS() will no longer be true.
*/
lgrp_t *
{
int i;
int lgrps_spanned;
unsigned long off;
int stat = 0;
/*
* Just return the root lgroup if the lgrp framework hasn't finished
* initializing or if this is a UMA machine.
*/
return (lgrp_root);
/*
* Get memory allocation policy for this segment
*/
if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
} else {
if (policy_info != NULL)
}
}
lgrpset = 0;
/*
* Initialize lgroup to home by default
*/
lgrp = lgrp_home_lgrp();
/*
* When homing threads on root lgrp, override default memory
* allocation policies with root lgroup memory allocation policy
*/
/*
* Implement policy
*/
switch (policy) {
case LGRP_MEM_POLICY_NEXT_CPU:
/*
* Return lgroup of current CPU which faulted on memory
* If the CPU isn't currently in an lgrp, then opt to
* allocate from the root.
*
* Kernel preemption needs to be disabled here to prevent
* the current CPU from going away before lgrp is found.
*/
if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
} else {
}
break;
case LGRP_MEM_POLICY_NEXT:
case LGRP_MEM_POLICY_DEFAULT:
default:
/*
* Just return current thread's home lgroup
* for default policy (next touch)
* If the thread is homed to the root,
* then the default policy is random across lgroups.
* Fallthrough to the random case.
*/
if (policy == LGRP_MEM_POLICY_NEXT)
else
LGRP_NUM_DEFAULT, 1);
break;
}
/* LINTED fallthrough on case statement */
case LGRP_MEM_POLICY_RANDOM:
/*
* Return a random leaf lgroup with memory
*/
/*
* Count how many lgroups are spanned
*/
/*
* There may be no memnodes in the root lgroup during DR copy
* rename on a system with only two boards (memnodes)
* configured. In this case just return the root lgrp.
*/
if (lgrps_spanned == 0) {
break;
}
/*
* Pick a random offset within lgroups spanned
* and return lgroup at that offset
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
if (!klgrpset_ismember(lgrpset, i))
continue;
if (off)
off--;
else {
lgrp = lgrp_table[i];
1);
break;
}
}
break;
/*
* Grab copy of bitmask of lgroups spanned by
* this process
*/
/* LINTED fallthrough on case statement */
if (!stat)
if (klgrpset_isempty(lgrpset)) {
/*
* Grab copy of bitmask of lgroups spanned by
* this processor set
*/
}
/*
* Count how many lgroups are spanned
*/
/*
* Probably lgrps_spanned should be always non-zero, but to be
* on the safe side we return lgrp_root if it is empty.
*/
if (lgrps_spanned == 0) {
break;
}
/*
* Pick a random offset within lgroups spanned
* and return lgroup at that offset
*/
for (i = 0; i <= lgrp_alloc_max; i++) {
if (!klgrpset_ismember(lgrpset, i))
continue;
if (off)
off--;
else {
lgrp = lgrp_table[i];
1);
break;
}
}
break;
/*
* Use offset within segment to determine
* offset from home lgroup to choose for
* next lgroup to allocate memory from
*/
(lgrp_alloc_max + 1);
while (off > 0) {
lgrp = lgrp_table[i];
if (klgrpset_ismember(lgrpset, i))
off--;
}
break;
}
return (lgrp);
}
/*
* Return the number of pages in an lgroup
*
* NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
* could cause tests that rely on the numat driver to fail....
*/
{
if (!LGRP_EXISTS(lgrp) ||
return (0);
}
/*
* Initialize lgroup shared memory allocation policy support
*/
void
{
/*
* Initialize locality field in anon_map
* Don't need any locks because this is called when anon_map is
* allocated, but not used anywhere yet.
*/
if (amp) {
/*
* Allocate and initialize shared memory locality info
* and set anon_map locality pointer to it
* Drop lock across kmem_alloc(KM_SLEEP)
*/
KM_SLEEP);
NULL);
/*
* Reacquire lock and check to see whether anyone beat
* us to initializing the locality info
*/
sizeof (*shm_locality));
} else
}
return;
}
/*
* Allocate shared vnode policy info if vnode is not locality aware yet
*/
/*
* Allocate and initialize shared memory locality info
*/
/*
* Point vnode locality field at shared vnode policy info
* and set locality aware flag in vnode
*/
} else {
/*
* Lost race so free locality info and increment count.
*/
}
return;
}
/*
* Increment reference count of number of segments mapping this vnode
* shared
*/
}
/*
* Destroy the given shared memory policy segment tree
*/
void
{
return;
}
}
/*
* Uninitialize lgroup shared memory allocation policy support
*/
void
{
/*
* For anon_map, deallocate shared memory policy tree and
* zero locality field
* Don't need any locks because anon_map is being freed
*/
if (amp) {
return;
return;
}
/*
* For vnode, decrement reference count of segments mapping this vnode
* shared and delete locality info if reference count drops to 0
*/
if (shm_locality->loc_count == 0) {
vp->v_locality = 0;
}
}
/*
* Compare two shared memory policy segments
* Used by AVL tree code for searching
*/
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
/* view the two opaque arguments as shared memory policy segments */
lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
/*
* NOTE(review): the three returns below have no visible guarding
* conditions, so as written -1 is always returned and the other two
* are unreachable. Presumably -1 / 1 / 0 are meant to report that x
* sorts before y / after y / matches y, each selected by a comparison
* of a and b -- confirm and restore the comparisons.
*/
return (-1);
return (1);
return (0);
}
/*
* Concatenate seg1 with seg2 and remove seg2
*/
static int
{
return (-1);
return (0);
}
/*
* Split segment at given offset and return rightmost (uppermost) segment
* Assumes that there are no overlapping segments
*/
static lgrp_shm_policy_seg_t *
{
return (NULL);
return (seg);
/*
* Adjust size of left segment and allocate new (right) segment
*/
/*
* Find where to insert new segment in AVL tree and insert it
*/
return (newseg);
}
/*
* Set shared memory allocation policy on specified shared object at given
* offset and length
*
* Return 0 if policy wasn't set already, 1 if policy was set already, and
* -1 if can't set policy.
*
* The -1 result is produced for a zero length request and when neither an
* anon_map (amp) nor a vnode (vp) is supplied.
*
* NOTE(review): the declarator is not visible here; policy, amp, vp and len
* are presumably parameters, judging only from how the body uses them --
* confirm against the full definition.
*/
int
{
int retval;
/* A zero length range cannot have a policy applied to it */
if (len == 0)
return (-1);
/* Start out assuming the policy was not set already */
retval = 0;
/*
* Get locality info and starting offset into shared object
* Try anon map first and then vnode
* Assume that no locks need to be held on anon_map or vnode, since
* it should be protected by its reference count which must be nonzero
* for an existing segment.
*/
if (amp) {
/*
* Get policy info from anon_map
*
*/
} else if (vp) {
/*
* Get policy info from vnode
*/
} else
/* No shared object of either kind was given */
return (-1);
/*
* Figure out default policy
*/
if (policy == LGRP_MEM_POLICY_DEFAULT)
/*
* Create AVL tree if there isn't one yet
* and set locality field to point at it
*/
if (!tree) {
sizeof (lgrp_shm_policy_seg_t),
} else {
/*
* Another thread managed to set up the tree
* before we could. Free the tree we allocated
* and use the one that's already there.
*/
}
}
/*
* Set policy
*
* Need to maintain hold on writer's lock to keep tree from
* changing out from under us
*
* Each pass handles the part of the range at the current offset;
* the loop ends when len reaches 0 or a branch breaks out.
*/
while (len != 0) {
/*
* Find policy segment for specified offset into shared object
*/
/*
* Didn't find any existing segment that contains specified
* offset, so allocate new segment, insert it, and concatenate
* with adjacent segments if possible
*/
KM_SLEEP);
/*
* Check to see whether new segment overlaps with next
* one, set length of new segment accordingly, and
* calculate remaining length and next offset
*/
len = 0;
} else {
}
/*
* Try to concatenate new segment with next and
* previous ones, since they might have the same policy
* now. Grab previous and next segments first because
* they will change on concatenation.
*/
continue;
}
/*
* Policy set already?
*/
/*
* Nothing left to do if offset and length
* fall within this segment
*/
/* Report "already set" to the caller */
retval = 1;
break;
} else {
continue;
}
}
/*
* Specified offset and length match existing segment exactly
*/
/*
* Set policy and update current length
*/
len = 0;
/*
* Try concatenating new segment with previous and next
* segments, since they might have the same policy now.
* Grab previous and next segments first because they
* will change on concatenation.
*/
} else {
/*
* Specified offset and length only apply to part of
* existing segment
*/
/*
* New segment starts in middle of old one, so split
* new one off near beginning of old one
*/
/*
* New segment ends where old one did, so try
* to concatenate with next segment
*/
(void) lgrp_shm_policy_concat(tree,
break;
}
}
/*
* New segment ends before old one, so split off end of
* old one
*/
if (newseg) {
(void) lgrp_shm_policy_split(tree,
} else {
eoff);
}
(void) lgrp_shm_policy_concat(tree,
break;
}
/*
* Calculate remaining length and next offset
*/
}
}
return (retval);
}
/*
* Return the best memnode from which to allocate memory given
* an lgroup.
*
* "c" is for cookie, which is good enough for me.
* It references a cookie struct that should be zero'ed to initialize.
* The cookie should live on the caller's stack.
*
* The routine returns -1 when:
* - traverse is 0, and all the memnodes in "lgrp" have been returned.
* - traverse is 1, and all the memnodes in the system have been
* returned.
*
* On success the chosen memnode is returned and the cookie's lmc_ntried
* count is incremented.
*
* NOTE(review): the declarator is not visible here, so the parameter list
* (beyond the cookie "c" used in the body) cannot be confirmed from this
* text.
*/
int
{
extern int max_mem_nodes;
/*
* If the set is empty, and the caller is willing, traverse
* up the hierarchy until we find a non-empty set.
*/
if (c->lmc_scope == LGRP_SRCH_LOCAL ||
return (-1);
}
/*
* Select a memnode by picking one at a "random" offset.
* Because of DR, memnodes can come and go at any time.
* This code must be able to cope with the possibility
* that the nodes count "cnt" is inconsistent with respect
* to the number of elements actually in "nodes", and
* therefore that the offset chosen could be greater than
* the number of elements in the set (some memnodes may
* have disappeared just before cnt was read).
* If this happens, the search simply wraps back to the
* beginning of the set.
*/
do {
/* offset counts down once per pass; stop when it has run out */
if (!offset--)
break;
/* Keep going while mnode is not a valid index (>= max_mem_nodes) */
} while (mnode >= max_mem_nodes);
/* Found a node. Store state before returning. */
c->lmc_ntried++;
return (mnode);
}