mpo.c revision 183ef8a1713ca188e24d970f22c6f9cc333007fd
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/mach_descrip.h>
#include <vm/hat_sfmmu.h>
/*
* MPO and the sun4v memory representation
* ---------------------------------------
*
* Latency groups are defined in the sun4v architecture by memory-latency-group
* nodes in the Machine Description, as specified in FWARC/2007/260. These
* tie together cpu nodes and mblock nodes, and contain mask and match
* properties that identify the portion of an mblock that belongs to the
* lgroup. Mask and match are defined in the Physical Address (PA) space,
* but an mblock defines Real Addresses (RA). To translate, the mblock
* includes the property address-congruence-offset, hereafter referred to as
* ra_to_pa. A real address ra is a member of an lgroup if
*
* (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
*
* The MD is traversed, and information on all mblocks is kept in the array
* mpo_mblock[]. Information on all CPUs, including which lgroup they map
* to, is kept in the array mpo_cpu[].
*
* This implementation makes (and verifies) the simplifying assumption that
* the mask bits are the same for all defined lgroups, and that all 1 bits in
* the mask are contiguous. Thus the number of lgroups is bounded by the
* number of possible mask values, and the lgrp_handle_t is defined as the
* mask value, shifted right to eliminate the 0 bit positions in mask. The
* masks and values are also referred to as "home bits" in the code.
*
* A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
* has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
* containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
* home bits. This yields the mem_node.
*
* Interfaces
* ----------
*
* This file exports the following entry points:
*
* plat_lgrp_init()
* plat_build_mem_nodes()
* plat_lgrp_cpu_to_hand()
* plat_lgrp_latency()
* plat_pfn_to_mem_node()
* These implement the usual platform lgroup interfaces.
*
* plat_rapfn_to_papfn()
* Recover the PA page coloring bits from an RA.
*
* plat_mem_node_iterator_init()
* Initialize an iterator to efficiently step through pages in a mem_node.
*
* plat_mem_node_intersect_range()
* Find the intersection with a mem_node.
*
* plat_slice_add()
* plat_slice_del()
*
* Internal Organization
* ---------------------
*
* appropriate MPO structures.
*
* mblock_alloc()
* Allocate memory for mblocks and stripes as
* appropriate for boot or memory DR.
*
* mblock_free()
* Free memory allocated by mblock_alloc.
*
* mblock_update()
* Build mblocks based on mblock nodes read from the MD.
*
* mblock_update_add()
* Rebuild mblocks after a memory DR add operation.
*
* mblock_update_del()
* Rebuild mblocks after a memory DR delete operation.
*
* mblock_install()
* Install mblocks as the new configuration.
*
* mstripe_update()
* Build stripes based on mblocks.
*
* mnode_update()
*
* The platform interfaces allocate all memory required for the
* particular update first, block access to the MPO structures
* while they are updated, and free old structures after the update.
*/
/*
 * Global enable for MPO. The MD traversal refuses to enable MPO when this
 * is 0, and clears it again when traversal fails so that later DR
 * operations see MPO as disabled.
 */
int sun4v_mpo_enable = 1;
/* NOTE(review): presumably gates MPO_DEBUG output -- confirm. */
int sun4v_mpo_debug = 0;
/* Save CPU info from the MD and associate CPUs with lgroups */
/* Save lgroup info from the MD */
#define MAX_MD_LGROUPS 32
/* Number of entries currently in use in mpo_lgroup[] */
static int n_lgrpnodes = 0;
/* Number of distinct "home" lgroups found; <= 1 means a non-MPO platform */
static int n_locality_groups = 0;
/*
 * Total number of possible locality groups as defined by the size of the
 * home mask; also the per-mblock row width of mem_stripes[].
 */
static int max_locality_groups = 0;
static int szc_mask0 = 0;
/* Save mblocks from the MD */
/* Boot-time mblock counts up to this value use preallocated static arrays */
#define SMALL_MBLOCKS_COUNT 8
/* Active mblock array; kept sorted in ascending address order */
static struct mblock_md *mpo_mblock;
static int n_mblocks = 0;
/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static int n_mem_stripes = 0;
static int stripe_shift; /* stride/stripes expressed as a shift */
/* Save home mask and shift used to calculate lgrp_handle_t values */
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;
/*
 * Save lowest and highest latencies found across all lgroups.
 * The traversal code divides both by 10000 after reading them, per its
 * note that the MD reports pico-seconds while the framework wants
 * nano-sec/10.
 */
static int lower_latency = 0;
static int higher_latency = 0;
/* Incremented by the memory DR slice add/delete hooks */
static int mpo_genid; /* config gen; updated by mem DR */
/* Forward declarations of static helpers */
static int fix_interleave(void);
static void mblock_install(mpo_config_t *);
static void mblock_free(mpo_config_t *);
static void mblock_update_add(mpo_config_t *);
static void mstripe_update(mpo_config_t *);
/* Debug support */
}
#else
#define MPO_DEBUG(...)
#endif /* DEBUG */
/* Record status message, viewable from mdb */
#define MPO_STATUS(args...) { \
}
/*
* The MPO locks are to protect the MPO metadata while that
* information is updated as a result of a memory DR operation.
* The read lock must be acquired to read the metadata and the
* write locks must be acquired to update it.
*/
#define mpo_rd_lock kpreempt_disable
#define mpo_rd_unlock kpreempt_enable
static void
{
}
static void
{
start_cpus();
}
/*
* Routine to read a uint64_t from a given md
*/
static int64_t
{
return (err);
}
/*
 * mblock_cmp
 *
 * Three-way comparison callback over two opaque element pointers; the
 * visible body yields -1, 0 or 1.
 *
 * NOTE(review): the conditions that select each return value are not
 * visible in this view. Presumably this orders mblocks by address for the
 * sort that mblock_update requires -- confirm against the full file.
 */
static int
mblock_cmp(const void *a, const void *b)
{
return (-1);
return (0);
else
return (1);
}
static void
{
int (*)(const void *, const void *));
}
/*
 * mpo_update_tunables
 *
 * Recompute the lgroup thread-spreading tunables described in the body
 * comment, walking every lgroup node currently recorded (n_lgrpnodes).
 *
 * NOTE(review): the loop body and the tunable assignments are not visible
 * in this view; per the comment below, ncpu_min is meant to track the
 * smallest lgroup's CPU count -- confirm against the full file.
 */
static void
mpo_update_tunables(void)
{
int i, ncpu_min;
/*
* lgrp_expand_proc_thresh is the minimum load on the lgroups
* this process is currently running on before considering
* expanding threads to another lgroup.
*
* lgrp_expand_proc_diff determines how much less the remote lgroup
* must be loaded before expanding to it.
*
* On sun4v CMT processors, threads share a core pipeline, and
* at less than 100% utilization, best throughput is obtained by
* spreading threads across more cores, even if some are in a
* different lgroup. Spread threads to a new lgroup if the
* current group is more than 50% loaded. Because of virtualization,
* lgroups may have different numbers of CPUs, but the tunables
* apply to all lgroups, so find the smallest lgroup and compute
* 50% loading.
*/
/* Scan all recorded lgroups */
for (i = 0; i < n_lgrpnodes; i++) {
}
/* new home may only be half as loaded as the existing home to use it */
}
static mde_cookie_t
{
int n_cpunodes, i;
return (MDE_INVAL_ELEM_COOKIE);
if (rootnode == MDE_INVAL_ELEM_COOKIE)
return (MDE_INVAL_ELEM_COOKIE);
"fwd", &cpunodes);
goto cpuid_fail;
for (i = 0; i < n_cpunodes; i++) {
&cpuid_prop))
break;
return (foundnode);
}
}
if (n_cpunodes > 0)
return (MDE_INVAL_ELEM_COOKIE);
}
static int
{
/* Find lgroup nodes reachable from this cpu */
"fwd", &nodes);
lowest_latency = ~(0UL);
/* Find the lgroup node with the smallest latency */
for (j = 0; j < n_lgroups; j++) {
&latency);
if (result != 0) {
j = -1;
goto to_lgrp_done;
}
if (latency < lowest_latency) {
}
}
for (j = 0; j < n_lgrpnodes; j++) {
break;
}
if (j == n_lgrpnodes)
j = -1;
if (n_lgroups > 0)
return (j);
}
/* Called when DR'ing in a CPU */
void
{
int i;
if (n_lgrpnodes <= 0)
return;
goto add_fail;
if (cpunode == MDE_INVAL_ELEM_COOKIE)
goto add_fail;
if (i == -1)
goto add_fail;
mpo_lgroup[i].ncpu++;
return;
panic("mpo_cpu_add: Cannot read MD");
}
/*
 * mpo_cpu_remove
 *
 * Called when DR'ing out a CPU. Decrements the CPU count of an lgroup
 * entry in mpo_lgroup[]; does nothing when no lgroup nodes are recorded.
 *
 * cpuid: id of the CPU being removed.
 *
 * NOTE(review): in this view `i` is read by the ncpu-- statement without
 * any visible assignment. The statement that loads the CPU's lgroup index
 * appears to be missing here -- confirm against the full file before
 * relying on this block.
 */
void
mpo_cpu_remove(int cpuid)
{
int i;
/* No lgroup bookkeeping to update when MPO found no lgroup nodes */
if (n_lgrpnodes <= 0)
return;
mpo_lgroup[i].ncpu--;
}
static mde_cookie_t
{
int n_nodes;
if (n_nodes <= 0) {
MPO_STATUS("md_get_root: No nodes in node count\n");
return (root);
}
if (root == MDE_INVAL_ELEM_COOKIE) {
MPO_STATUS("md_get_root: Root node is missing\n");
return (root);
}
done:
return (root);
}
static int
{
int i, j, result;
int ret_val = 0;
int sub_page_fix;
"fwd", &lgrpnodes);
MPO_STATUS("lgrp_update: No Lgroups\n");
ret_val = -1;
goto fail;
}
for (i = 0; i < n_lgrpnodes; i++) {
mpo_lgroup[i].id = i;
mpo_lgroup[i].ncpu = 0;
&mpo_lgroup[i].addr_mask);
&mpo_lgroup[i].addr_match);
/*
* If either the mask or match properties are missing, set to 0
*/
if (result < 0) {
mpo_lgroup[i].addr_mask = 0;
mpo_lgroup[i].addr_match = 0;
}
/* Set latency to 0 if property not present */
&mpo_lgroup[i].latency);
if (result < 0)
mpo_lgroup[i].latency = 0;
}
/*
* Sub-page level interleave is not yet supported. Check for it,
* and remove sub-page interleaved lgroups from mpo_lgroup and
* n_lgrpnodes. If no lgroups are left, return.
*/
if (n_lgrpnodes == 0) {
ret_val = -1;
goto fail;
}
/* Ensure that all of the addr_mask values are the same */
for (i = 0; i < n_lgrpnodes; i++) {
MPO_STATUS("lgrp_update: "
"addr_mask values are not the same\n");
ret_val = -1;
goto fail;
}
}
/*
* Ensure that all lgrp nodes see all the mblocks. However, if
* sub-page interleave is being fixed, they do not, so skip
* the check.
*/
if (sub_page_fix == 0) {
for (i = 0; i < n_lgrpnodes; i++) {
if (j != n_mblocks) {
MPO_STATUS("lgrp_update: "
"sub-page interleave is being fixed\n");
ret_val = -1;
goto fail;
}
}
}
fail:
if (n_lgrpnodes > 0) {
for (i = 0; i < n_lgrpnodes; i++)
}
return (ret_val);
}
/*
*
* Traverse the MD to determine:
*
* Number of CPU nodes, lgrp_nodes, and mblocks
* Then for each lgrp_node, obtain the appropriate data.
* For each CPU, determine its home locality and store it.
* For each mblock, retrieve its data and store it.
*/
static int
{
int o;
uint64_t mem_lg_homeset = 0;
int ret_val = 0;
int result = 0;
int n_cpunodes = 0;
ret_val = -1;
goto fail;
}
&mblocknodes);
if (n_mblocks <= 0) {
MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
"Descriptor\n");
ret_val = -1;
goto fail;
}
/*
* Build the Memory Nodes. Do this before any possibility of
* bailing from this routine so we obtain ra_to_pa (needed for page
* coloring) even when there are no lgroups defined.
*/
ret_val = -1;
goto fail;
}
/* Page coloring hook is required so we can iterate through mnodes */
if (&page_next_pfn_for_color_cpu == NULL) {
MPO_STATUS("lgrp_traverse: No page coloring support\n");
ret_val = -1;
goto fail;
}
/* Global enable for mpo */
if (sun4v_mpo_enable == 0) {
MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
ret_val = -1;
goto fail;
}
MPO_STATUS("lgrp_traverse: No CPU nodes detected "
"in MD\n");
ret_val = -1;
goto fail;
}
goto fail;
/*
* Use the address mask from the first lgroup node
* to establish our home_mask.
*/
/*
* How many values are possible in home mask? Assume the mask
* bits are contiguous.
*/
/* Now verify the home mask bits are contiguous */
MPO_STATUS("lgrp_traverse: "
"home mask bits are not contiguous\n");
ret_val = -1;
goto fail;
}
/* Record all of the home bits */
for (i = 0; i < n_lgrpnodes; i++) {
}
/* Count the number different "home" mem_lg's we've discovered */
/* If we have only 1 locality group then we can exit */
if (n_locality_groups == 1) {
MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
ret_val = -1;
goto fail;
}
/*
* Set the latencies. A CPU's lgroup is defined by the lowest
* latency found. All other memory is considered remote, and the
* remote latency is represented by the highest latency found.
* Thus hierarchical lgroups, if any, are approximated by a
* two level scheme.
*
* The Solaris MPO framework by convention wants to see latencies
* in units of nano-sec/10. In the MD, the units are defined to be
* pico-seconds.
*/
for (i = 1; i < n_lgrpnodes; i++) {
}
}
}
lower_latency /= 10000;
higher_latency /= 10000;
/* Clear our CPU data */
for (i = 0; i < NCPU; i++) {
}
/* Build the CPU nodes */
for (i = 0; i < n_cpunodes; i++) {
/* Read in the lgroup nodes */
if (result < 0) {
MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
ret_val = -1;
goto fail;
}
if (o == -1) {
ret_val = -1;
goto fail;
}
mpo_cpu[k].lgrp_index = o;
mpo_lgroup[o].ncpu++;
}
/* Validate that no large pages cross mnode boundaries. */
ret_val = -1;
goto fail;
}
fail:
if (n_cpunodes > 0)
if (n_mblocks > 0)
else
panic("lgrp_traverse: No memory blocks found");
if (ret_val == 0) {
MPO_STATUS("MPO feature is enabled.\n");
} else
sun4v_mpo_enable = 0; /* set this for DR */
return (ret_val);
}
/*
* Determine the number of unique mem_lg's present in our system
*/
static int
{
int homeid;
int count = 0;
/*
* Scan the "home" bits of the mem_lgs, count
* the number that are unique.
*/
count++;
}
}
MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
/* Default must be at least one */
if (count == 0)
count = 1;
return (count);
}
/*
* Platform specific lgroup initialization
*/
/*
 * plat_lgrp_init
 *
 * Obtains the Machine Descriptor handle (panicking if it cannot be
 * accessed), releases it when done, and then either falls back to a
 * single-lgroup configuration (rc == -1) or sets the lgroup-related
 * tunables shown below.
 *
 * NOTE(review): the test guarding the panic and the statement that
 * assigns rc are not visible in this view; rc presumably holds the result
 * of processing the MD for lgroups -- confirm against the full file.
 */
void
plat_lgrp_init(void)
{
int rc;
/* Get the Machine Descriptor handle */
md = md_get_handle();
/* If not, we cannot continue */
panic("cannot access machine descriptor\n");
} else {
/* Release the MD handle; the return value is deliberately ignored */
(void) md_fini_handle(md);
}
/*
* If we can't process the MD for lgroups then at least let the
* system try to boot. Assume we have one lgroup so that
* when plat_build_mem_nodes is called, it will attempt to init
* an mnode based on the supplied memory segment.
*/
if (rc == -1) {
/* Single-lgroup fallback: no home mask, exactly one locality group */
home_mask_pfn = 0;
max_locality_groups = 1;
n_locality_groups = 1;
return;
}
/*
 * NOTE(review): presumably zeroed because mem_node lookup on this
 * platform goes through plat_pfn_to_mem_node() rather than a simple
 * shift/alignment scheme -- confirm.
 */
mem_node_pfn_shift = 0;
mem_node_physalign = 0;
/* Use lgroup-aware TSB allocations */
tsb_lgrp_affinity = 1;
/* Require that a home lgroup have some memory to be chosen */
lgrp_mem_free_thresh = 1;
/* Standard home-on-next-touch policy */
/* Disable option to choose root lgroup if all leaf lgroups are busy */
}
/*
* Helper routine for debugging calls to mem_node_add_slice()
*/
static void
{
static int slice_count = 0;
slice_count++;
MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
#endif
}
static void
{
static int slice_count = 0;
slice_count++;
MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
#endif
}
/*
* Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
*/
static void
{
MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
}
/*
* plat_build_mem_nodes()
*
* Define the mem_nodes based on the modified boot memory list,
* or based on info read from the MD in plat_lgrp_init().
*
* When the home mask lies in the middle of the address bits (as it does on
* Victoria Falls), then the memory in one mem_node is no longer contiguous;
* it is striped across an mblock in a repeating pattern of contiguous memory
* followed by a gap. The stripe width is the size of the contiguous piece.
* The stride is the distance from the start of one contiguous piece to the
* start of the next. The gap is thus stride - stripe_width.
*
* The stripe of an mnode that falls within an mblock is described by the type
* mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
* mem_stripe_t's are kept in a global array mem_stripes[]. The index into
* this array is predetermined. The mem_stripe_t that describes mnode m
* within mpo_mblock[i] is stored at
* mem_stripes[ m + i * max_locality_groups ]
*
* max_locality_groups is the total number of possible locality groups,
* as defined by the size of the home mask, even if the memory assigned
* to the domain is small and does not cover all the lgroups. Thus some
* mem_stripe_t's may be empty.
*
* The members of mem_stripe_t are:
* physbase: First valid page in mem_node in the corresponding mblock
* physmax: Last valid page in mem_node in mblock
* offset: The full stripe width starts at physbase - offset.
* Thus if offset is non-zero, this mem_node starts in the middle
* of a stripe width, and the second full stripe starts at
* physbase - offset + stride. (even though physmax may fall in the
* middle of a stripe width, we do not save the ending fragment size
* in this data structure.)
* exists: Set to 1 if the mblock has memory in this mem_node stripe.
*
* The stripe width is kept in the global mnode_pages.
* The stride is kept in the global mnode_stride.
* All the above use pfn's as the unit.
*
* As an example, the memory layout for a domain with 2 mblocks and 4
* mem_nodes 0,1,2,3 could look like this:
*
* 123012301230 ... 012301230123 ...
* mblock 0 mblock 1
*/
/*ARGSUSED*/
void
{
int elem;
/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
/* Check for non-MPO sun4v platforms */
if (n_locality_groups <= 1) {
}
mem_node_pfn_shift = 0;
mem_node_physalign = 0;
} else
/*
* Indicate to vm_pagelist that the hpm_counters array
* should be shared because the ranges overlap.
*/
if (max_mem_nodes > 1) {
interleaved_mnodes = 1;
}
}
/*
* Return the locality group value for the supplied processor
*/
{
mpo_rd_lock();
if (n_locality_groups > 1) {
} else {
}
return (lgrphand);
}
int
{
/*
* Return min remote latency when there are more than two lgroups
* (root and child) and getting latency between two different lgroups
* or root is involved.
*/
return ((int)higher_latency);
} else {
return ((int)lower_latency);
}
}
int
{
int i, mnode;
if (n_locality_groups <= 1)
return (0);
/*
* The mnode is defined to be 1:1 with the lgroup handle, which
* is taken from the home bits. Find the mblock in which
* the pfn falls to get the ra_to_pa adjustment, and extract
* the home bits.
*/
mpo_rd_lock();
mb = &mpo_mblock[0];
for (i = 0; i < n_mblocks; i++) {
return (mnode);
}
mb++;
}
return (pfn);
}
/*
* plat_rapfn_to_papfn
*
* Convert a pfn in RA space to a pfn in PA space, in which the page coloring
* and home mask bits are correct. The upper bits do not necessarily
* match the actual PA, however.
*/
{
int i;
if (n_mblocks == 1)
return (pfn + base_ra_to_pa_pfn);
/*
* Find the mblock in which the pfn falls
* in order to get the ra_to_pa adjustment.
*/
mpo_rd_lock();
return (pfn + ra_to_pa_pfn);
}
}
return (pfn);
}
/*
* plat_mem_node_iterator_init()
* Initialize cookie "it" to iterate over pfn's in an mnode. There is
* no additional iterator function. The caller uses the info from
* the iterator structure directly.
*
* pfn: starting pfn.
* mnode: desired mnode.
* szc: desired page size.
* init:
* if 1, start a new traversal, initialize "it", find first
* mblock containing pfn, and return its starting pfn
* within the mnode.
* if 0, continue the previous traversal using passed-in data
* from "it", advance to the next mblock, and return its
* starting pfn within the mnode.
* it: returns readonly data to the caller; see below.
*
* The input pfn must be aligned for the page size szc.
*
* which is aligned according to the page size,
* or returns (pfn_t)(-1) if the input pfn lies past the last
* valid pfn of the mnode.
* Returns misc values in the "it" struct that allows the caller
* to advance the pfn within an mblock using address arithmetic;
* see definition of mem_node_iterator_t in vm_dep.h.
* When the caller calculates a pfn that is greater than the
* returned value it->mi_mblock_end, the caller should again
* call plat_mem_node_iterator_init, passing init=0.
*
* The last mblock in continuation case may be invalid because
* of memory DR. To detect this situation mi_genid is checked
* against mpo_genid which is incremented after a memory DR
* operation. See also plat_slice_add()/plat_slice_del().
*/
{
int i;
mpo_rd_lock();
it->mi_last_mblock = 0;
}
/* Check if mpo is not enabled and we only have one mblock */
goto done;
}
it->mi_mnode_pfn_mask = 0;
it->mi_mnode_pfn_shift = 0;
it->mi_mnode_mask = 0;
goto done;
}
/* init=1 means begin iterator, init=0 means continue */
if (init == 1) {
i = 0;
} else {
i = it->mi_last_mblock;
if (++i == n_mblocks) {
goto done;
}
}
/*
* Find the mblock that contains pfn for mnode's stripe, or the first such
* mblock after pfn; otherwise pfn is out of bounds and we return -1.
* mblocks and stripes are sorted in ascending address order.
*/
for (; i < n_mblocks; i++) {
continue;
break;
}
if (i == n_mblocks) {
goto done;
}
it->mi_last_mblock = i;
mblock = &mpo_mblock[i];
}
done:
return (pfn);
}
/*
* plat_mem_node_intersect_range()
*
* Find the intersection between a memnode and a range of pfn's.
*/
void
{
int i, npages;
*npages_out = 0;
return;
return;
if (n_locality_groups == 1) {
return;
}
npages = 0;
/*
* Iterate over all the stripes for this mnode (one per mblock),
* find the intersection with each, and accumulate the intersections.
*
* Determining the intersection with a stripe is tricky. If base or end
* mem_node. If base or end fall in a gap, round them to start of
* nearest stripe. If they fall within a stripe, keep base or end,
* but calculate the fragment size that should be excluded from the
* stripe. Calculate how many strides fall in the adjusted range,
* multiply by stripe width, and add the start and end fragments.
*/
mpo_rd_lock();
ms = &mem_stripes[i];
/* Round test_base to next multiple of stride */
/*
* Compute distance from test_base to the
* stride boundary to see if test_base falls
* in the stripe or in the hole.
*/
/*
* test_base lies in stripe,
* and offset should be excluded.
*/
(nearest - mnode_stride);
} else {
/* round up to next stripe start */
offset = 0;
continue;
}
}
end++; /* adjust to an exclusive bound */
/* Round end to next multiple of stride */
/* end falls in hole, use entire last stripe */
frag = 0;
} else {
/* end falls in stripe, compute fragment */
}
}
}
*npages_out = npages;
}
/*
* valid_pages()
*
* Return 1 if pages are valid and do not cross mnode boundaries
* (which would break page free list assumptions), and 0 otherwise.
*/
static int
{
int i, max_szc;
/*
* Find the smaller of the largest page possible and supported.
* mmu_exported_pagesize_mask is not yet initialized, so read
* it from the MD. Apply minimal fixups in case of broken MDs
* to get a sane mask.
*/
else {
szc_mask = 0;
/* largest in sun4v default support */
}
/*
* Page coalescing code coalesces all sizes up to 256M on sun4v, even
* if mmu-page-size-list does not contain it, so 256M pages must fall
* within one mnode to use MPO.
*/
MPO_STATUS("Page too large; MPO disabled: page = %lx, "
return (0);
}
for (i = 0; i < n_mblocks; i++) {
/*
* If mblock is smaller than the max page size, then
* RA = PA mod MAXPAGE is not guaranteed, but it must
* not span mnodes.
*/
MPO_STATUS("Small mblock spans mnodes; "
"MPO disabled: base = %lx, end = %lx, "
return (0);
}
} else {
/* Verify RA = PA mod MAXPAGE, using coalesce size */
MPO_STATUS("bad page alignment; MPO disabled: "
"ra = %lx, pa = %lx, pagelen = %lx\n",
return (0);
}
}
/*
* Find start of last large page in mblock in RA space.
* If page extends into the next mblock, verify the
* mnode does not change.
*/
if (i + 1 < n_mblocks &&
MPO_STATUS("Large page spans mblocks; MPO disabled: "
"end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
return (0);
}
mb++;
}
return (1);
}
/*
* fix_interleave() - Find lgroups with sub-page sized memory interleave,
* if any, and remove them. This yields a config where the "coarse
* grained" lgroups cover all of memory, even though part of that memory
* is fine grain interleaved and does not deliver a purely local memory
* latency.
*
* This function reads and modifies the globals:
* mpo_lgroup[], n_lgrpnodes
*
* Returns 1 if lgroup nodes were removed, 0 otherwise.
*/
/*
 * Compacts mpo_lgroup[] in place: entries that are kept are copied down
 * to index j, removed entries are skipped, and n_lgrpnodes is set to the
 * number kept. Returns non-zero when `mask` is non-zero.
 *
 * NOTE(review): the declaration of `mask` and the test that decides
 * whether an lgroup is removed are not visible in this view.
 */
static int
fix_interleave(void)
{
int i, j;
/* j is the write index for surviving lgroups; i is the read index */
j = 0;
for (i = 0; i < n_lgrpnodes; i++) {
/* remove this lgroup */
} else {
/* keep this lgroup: slide it down over any removed entries */
mpo_lgroup[j++] = mpo_lgroup[i];
}
}
/* Only the first j entries of mpo_lgroup[] remain valid */
n_lgrpnodes = j;
if (mask != 0)
MPO_STATUS("sub-page interleave %lx found; "
"removing lgroup.\n", mask);
return (mask != 0);
}
/*
* mblock_alloc
*
* Allocate memory for mblock and stripe arrays from either static or
* dynamic space depending on utype, and return the result in mc.
* Returns 0 on success and -1 on error.
*/
static int
{
/*
* Allocate space for mblocks and mstripes.
*
* For DR allocations, just use kmem_alloc(), and set
* mc_alloc_sz to indicate it was used.
*
* For boot allocation:
* If we have a small number of mblocks we will use the space
* that we preallocated. Otherwise, we will dynamically
* allocate the space from the prom and map it to the
* reserved VA at MPOBUF_BASE.
*/
} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
mb = &small_mpo_mblocks[0];
ms = &small_mem_stripes[0];
mc->mc_alloc_sz = 0;
} else {
/* Ensure that we dont request more space than reserved */
if (allocsz > MPOBUF_SIZE) {
MPO_STATUS("mblock_alloc: Insufficient space "
"for mblock structures \n");
return (-1);
}
MPO_STATUS("mblock_alloc: Cannot allocate space "
"for mblocks \n");
return (-1);
}
mc->mc_alloc_sz = 0;
}
return (0);
}
/*
* mblock_free
*
* Free memory in mc that was allocated by mblock_alloc.
*/
static void
{
if (mc->mc_alloc_sz > 0) {
}
}
/*
* mblock_install
*
* Install mblock config passed in mc as the global configuration.
* May only be called at boot or while holding mpo_wr_lock.
*/
static void
{
mpo_config = *mc;
}
/*
* mblock_update
*
* Traverse mblocknodes, read the mblock properties from the MD, and
* save the mblocks in mc.
*/
static void
{
uint64_t i, j;
int result = 0;
for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
/* Without a base or size value we will fail */
if (result < 0) {
MPO_STATUS("mblock_update: "
"PROP_LG_BASE is missing\n");
mc->mc_nmblocks = 0;
return;
}
if (result < 0) {
MPO_STATUS("mblock_update: "
"PROP_LG_SIZE is missing\n");
mc->mc_nmblocks = 0;
return;
}
/* If we don't have an ra_pa_offset, just set it to 0 */
if (result < 0)
MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
"ra_to_pa = %lx\n", i,
/* check for unsupportable values of base and size */
MPO_STATUS("mblock_update: "
"PROP_LG_BASE+PROP_LG_SIZE is invalid: "
"base = %lx, size = %lx\n",
mc->mc_nmblocks = 0;
return;
}
/* eliminate size==0 blocks */
i++;
}
}
if (i == 0) {
MPO_STATUS("mblock_update: "
"No non-empty mblock nodes were found "
"in the Machine Descriptor\n");
mc->mc_nmblocks = 0;
return;
}
mc->mc_nmblocks = i;
/* Must sort mblocks by address for mem_node_iterator_init() */
}
/*
* mblock_update_add
*
* Update mblock config after a memory DR add. The added range is not
* needed, as we read *all* mblock nodes from the MD. Save the mblocks
* in mc.
*/
static void
{
int nmblocks = 0;
MPO_STATUS("Cannot access Machine Descriptor\n");
goto error;
}
goto error;
&mblocknodes);
if (nmblocks <= 0) {
MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
goto error;
}
goto error;
(void) md_fini_handle(md);
return;
panic("mblock_update_add: cannot process mblocks from MD.\n");
}
/*
* mblock_update_del
*
* Update mblocks after a memory DR deletion of the range (ubase, uend).
* Allocate a new mblock config, copy old config to the new, modify the new
* mblocks to reflect the deletion. The new mblocks are returned in
* mc_new and are not yet installed as the active config.
*/
static void
{
int i, j;
/*
* Allocate mblocks in mc_new and copy the old to the new.
* Allocate one extra in case the deletion splits an mblock.
*/
return;
/*
* Find the mblock containing the deleted range and adjust it in
* the new config.
*/
for (i = 0; i < nmblocks; i++) {
/*
* Adjust the mblock based on the subset that was deleted.
*
* If the entire mblk was deleted, compact the table.
*
* If the middle of the mblk was deleted, extend
* the table. Space for the new slot was already
* allocated.
*
* The memory to be deleted is an mblock or a subset of one,
* and does not span multiple mblocks.
*/
for (j = i; j < nmblocks - 1; j++)
nmblocks--;
break;
for (j = nmblocks - 1; j >= i; j--)
nmblocks++;
break;
MPO_DEBUG("mblock_update_del: shrink>"
break;
MPO_DEBUG("mblock_update_del: shrink<"
break;
}
}
}
/*
* mstripe_update
*
* Read mblocks from mc and update mstripes in mc
*/
static void
{
int i, mnode;
/* Check for non-MPO sun4v platforms or memory DR removal */
if (n_locality_groups <= 1) {
if (nmblocks == 1) {
mc->mc_nstripes = 0;
} else {
for (i = 0; i < nmblocks; i++) {
}
}
return;
}
for (i = 0; i < nmblocks; i++) {
/* Find the offset from the prev stripe boundary in PA space. */
/* Set the next stripe boundary. */
/*
* Loop over all lgroups covered by the mblock, creating a
* stripe for each. Stop when lgrp_start is visited again.
*/
do {
/* mblock may not span all lgroups */
break;
/*
* Calculate the size of the fragment that does not
* belong to the mnode in the last partial stride.
*/
if (frag == 0) {
/* remove the gap */
/* fragment fits in stripe; keep it all */
remove = 0;
} else {
/* fragment is large; trim after whole stripe */
}
base = stripe_end;
stripe_end += stripe;
offset = 0;
} while (lgrphand != lgrp_start);
}
}
/*
 * INTERSECT(a, b, c, d)
 *
 * Clip the inclusive range [c, d] to its overlap with the inclusive
 * range [a, b]. c and d must be modifiable lvalues; when the ranges
 * overlap they are overwritten with MAX(a, c) and MIN(b, d). When the
 * ranges do not overlap, the macro executes "continue", so it may only
 * be used directly inside a loop body.
 *
 * The expansion is a bare if/else rather than do { } while (0) because
 * "continue" has to act on the caller's loop. Do not use it as the
 * unbraced body of an enclosing if, and note that every argument is
 * evaluated more than once.
 *
 * The last line carries no trailing backslash, so the macro ends at its
 * closing brace and cannot absorb the source line that follows it.
 */
#define INTERSECT(a, b, c, d) \
	if (((a) >= (c) && (a) <= (d)) || \
	    ((c) >= (a) && (c) <= (b))) { \
		(c) = MAX((a), (c)); \
		(d) = MIN((b), (d)); \
	} else { \
		ASSERT((a) >= (d) || (b) <= (c)); \
		continue; \
	}
/*
* mnode_update
*
* Read stripes from mc and update mnode extents. The mnode extents are
* part of the live configuration, so this can only be done at boot time
* or while holding the mpo_wr_lock.
*/
static void
{
else
return;
}
found = 0;
for (i = 0; i < mc->mc_nmblocks; i++) {
j = i * max_locality_groups + mnode;
continue;
/*
* Look for the mstripes intersecting this slice.
*
* The mstripe and slice pairs may not be equal
*/
switch (utype) {
case U_ADD:
/*FALLTHROUGH*/
case U_ADD_ALL:
if (n_locality_groups > 1)
break;
case U_DEL:
break;
default:
break;
}
found++;
}
}
if (!found)
panic("mnode_update: mstripe not found");
#ifdef DEBUG
return;
found = 0;
for (i = 0; i < max_mem_nodes; i++) {
if (!mem_node_config[i].exists)
continue;
found |= 1;
found |= 2;
}
{
}
#endif
}
/*
* Plat_slice_add()/plat_slice_del() are the platform hooks
*
*
* - Zeus has already added the mblocks to the MD, so read the updated
* MD and allocate all data structures required to manage the new memory
* configuration.
*
* - Recompute the stripes which are derived from the mblocks.
*
* - Update (expand) the mnode extents and install the modified mblocks as
* the new mpo config. This must be done while holding the mpo_wr_lock
* to guarantee that no other threads access the mpo meta-data.
*
* - Unlock MPO data structures; the new config is live. Free the old config.
*
* Plat_slice_del() is used for DR only.
*
* - Zeus has not yet modified the MD to reflect the deletion, so copy
* the old mpo mblocks and delete the range from the copy.
*
* - Recompute the stripes which are derived from the mblocks.
*
* - Update (shrink) the mnode extents and install the modified mblocks as
* the new mpo config. This must be done while holding the mpo_wr_lock
* to guarantee that no other threads access the mpo meta-data.
*
* - Unlock MPO data structures; the new config is live. Free the old config.
*/
void
{
mpo_wr_lock();
/* Use new config to add all ranges for mnode_update */
mpo_genid++;
}
void
{
mpo_wr_lock();
/* Use old config to find deleted range for mnode_update */
mpo_genid++;
}