/* vm_dep.c revision d0662dbfa1a7064416d570c479c4dc3a0782a4f8 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* UNIX machine dependent virtual memory support.
*/
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/vmsystm.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/cpu_module.h>
#include <vm/hat_sfmmu.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/platform_module.h>
/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;
/*
* These variables can be conveniently patched at kernel load time to
* prevent do_pg_coloring or do_virtual_coloring from being enabled by
* module specific config routines.
*/
int use_page_coloring = 1;
int use_virtual_coloring = 1;
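
/*
 * For example, with the standard /etc/system "set" syntax (a minimal
 * sketch; the variable names above are the ones consulted by the module
 * config routines), page coloring can be forced off before those
 * routines run:
 *
 *	set use_page_coloring = 0
 *	set use_virtual_coloring = 0
 */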
/*
* initialized by page_coloring_init()
*/
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;
/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround. It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif
extern int disable_auto_large_pages; /* used by map_pgsz*() routines */
/*
* these must be defined in platform specific areas
*/
/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}
/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}
/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments. If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
	/* This code is probably not needed anymore */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above. We just fall through
		 * to as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:
	return (res);
}
/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
extern caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t	kpm_size;
uchar_t	kpm_size_shift;
/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1l;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen) {
		return (0);
	}

	hi = lo + *lenp;

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}
/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}
/*
* Routine used to check to see if an a.out can be executed
* by the current machine/architecture.
*/
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}
/*
* The following functions return information about an a.out
* which is used when a program is executed.
*/
/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}
/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}
/*
* Return the load memory address for the text segment.
*/
/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}
/*
* Return the file byte offset for the text segment.
*/
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}
void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (TTEBYTES(n) <= (len)) {			\
			(pgsz) = TTEBYTES(n);			\
			break;					\
		}						\
	}
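
/*
 * Illustration of the macro above (sizes assumed: TTE8K=8K, TTE64K=64K,
 * TTE512K=512K, TTE4M=4M): for len == 600K with upper == TTE4M and
 * lower == TTE8K - 1, TTE4M is rejected because TTEBYTES(TTE4M) > len,
 * so pgsz becomes TTEBYTES(TTE512K) == 512K -- unless bit TTE512K is set
 * in disable_auto_large_pages, in which case the loop falls through to
 * TTE64K.
 */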
extern uint_t	auto_lpg_tlb_threshold;
extern int	auto_lpg_minszc;
extern int	auto_lpg_maxszc;
extern size_t	auto_lpg_heap_default;
extern size_t	auto_lpg_stack_default;

/*ARGSUSED*/
static size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t	pgsz = MMU_PAGESIZE;
	int	n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size. This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the limit so we never drop
		 * below auto_lpg_minszc. We don't have a size code to refer
		 * to like we have for bss and stack, so we assume 0.
		 * auto_lpg_minszc should always be >= 0; using
		 * auto_lpg_minszc - 1 cuts off the loop.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}
static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t	pgsz;
	int	n, upper;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero? Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		MAP_PGSZ_COMMON(pgsz, n, upper,
		    MAX(auto_lpg_minszc - 1, p->p_brkpageszc), len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0. Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}
static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t	pgsz;
	int	n, upper;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero? Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		MAP_PGSZ_COMMON(pgsz, n, upper,
		    MAX(auto_lpg_minszc - 1, p->p_stkpageszc), len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0. Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}
/*
 * Return non 0 value if the address may cause a VAC alias with KPM mappings.
 * KPM selects an address such that its offset is equal modulo shm_alignment
 * and assumes it can't be in VAC conflict with any larger than PAGESIZE
 * mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & (shm_alignment - 1));
	} else {
		return (0);
	}
}
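
/*
 * Worked example (shm_alignment of 64K assumed): addr 0x10000 with file
 * offset 0x2000 gives ((0x10000 ^ 0x2000) & 0xffff) == 0x2000, so the
 * mapping could land in a different VAC color than KPM's and the check
 * reports a possible alias; addr 0x12000 with the same offset XORs to
 * zero in the low 16 bits and passes.
 */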
/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;
/*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used. A particular page size for text or initialized data will be used by
 * default only if one of the use_* variables is set to 1 AND this page size
 * is not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;
/*
 * Minimum segment size tunables: how big a segment must be before
 * 64K or 4M large pages should be used to map it.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;
/*
 * Sanity control. Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The units for this variable are 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;	/* 1GB */
/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}
	return (1 << TTE4M);
}
static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;
	caddr_t svaddr = addr;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	addr = a;
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}

	/* can we also use 4M pages in this segment? */
	if (!use_text_pgsz4m ||
	    (disable_text_largepages & (1 << TTE4M))) {
		return (1 << TTE64K);
	}
	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}
	len = svlen;
	addr = svaddr;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	addr = a;
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}
	return ((1 << TTE64K) | (1 << TTE4M));
}
static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}
/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}
	return (ret);
}
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory. Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages. So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
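
/*
 * A worked instance of the example above (8K base pages assumed, TTE
 * size codes 0=8K, 1=64K, 2=512K, 3=4M): a 4M region covers 2^22 bytes,
 * i.e. 2^9 8K pages, so the counter index for a page frame number is
 *
 *	n = pfn >> (22 - 13);	-- 8K pfn to 4M region index
 *
 * and page_counters[1][3].counters[n] counts complete 512K groups of
 * free 64K pages in region n; 4M / 512K == 8 of them make a candidate.
 */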
/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
static caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_get_pagecolors(0));
		/*
		 * Allocate freelists bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	return (alloc_base);
}
/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area. This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t	alloc_sz = 0;
	caddr_t	alloc_base;
	int	mtype;
	uint_t	szc;
	int	allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes not initialized");
		}
	}

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);
		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);
		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (kmutex_t));
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (kmutex_t));
		}
		alloc_sz = 0;
	}

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_get_pagecolors(0);
		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	(void) alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	return (0);
}
/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337. Why 337? It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint_t color_start_current = 0;
uint_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    page_colors_mask));
	}
	do {
		old = color_start_current;
		new = old + (color_start_stride <<
		    (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new & page_colors_mask));
}
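
/*
 * Sketch of why the prime stride works: page_colors is a power of two,
 * so gcd(337, page_colors) == 1 and the low bits of successive
 * multiples of 337 cycle through all page_colors values before
 * repeating; the vac_shift scaling above simply applies that walk at
 * VAC-color granularity. (337 itself is empirical, per the comment
 * above.)
 */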
/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	/*
	 * initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
	    cpu_setsize < ecache_setsize)
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}
}
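
/*
 * Worked numbers (cache geometry assumed for illustration): a 1MB,
 * 4-way L2 cache has ecache_setsize == 256K, giving
 *
 *	page_colors = 256K / 8K = 32,	page_colors_mask = 31,
 *	page_coloring_shift = log2(256K) = 18
 *
 * and, with a 16K direct-mapped VAC, vac_colors = 16K / 8K = 2.
 */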
int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if (bp->b_flags & B_PAGEIO) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}
/*
 * Create & Initialize pageout scanner thread. The thread has to
 * start at procedure with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}
/*
* Function for flushing D-cache when performing module relocations
* to an alternate mapping. Stubbed out on all platforms except sun4u,
* at least for now.
*/
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);
	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);
	return (1);
}
/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
	return (sz); /* no overlap */
}
/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * this function chooses large page size for kernel heap
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;
	size_t mmusz;
	uint_t szc;
	extern int disable_large_pages;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	mmusz = mmu_get_kernel_lpsize(lpsize);
	szc = page_szc(mmusz);

	while (szc) {
		if (!(disable_large_pages & (1 << szc)))
			return (page_get_pagesize(szc));
		szc--;
	}
	return (PAGESIZE);
}