vm_machdep.c revision e4ab3d6da56d08ba5dd5d99400d5963b44c4d3a3
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley 4.3 BSD
* under license from the Regents of the University of California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* UNIX machine dependent virtual memory support.
*/
#include <vm/seg_kmem.h>
#include <sys/vm_machparam.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/ddidmareq.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#include <sys/balloon_impl.h>
/*
* domain 0 pages usable for DMA are kept pre-allocated and kept in
* distinct lists, ordered by increasing mfn.
*/
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define DEFAULT_IO_POOL_MIN 128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks; /* how many times did we really shrink */
static long io_pool_grows; /* how many times did we grow */
static int create_contig_pfnlist(uint_t);
/*
* percentage of phys mem to hold in the i/o pool
*/
#define DEFAULT_IO_POOL_PCT 2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
int ioalloc_dbg = 0;
#endif /* __xpv */
int largepagesupport = 0;
/*
* The page_create_* variables below are defined outside this file; each one
* only needs to be declared once here.
*/
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
* Allow users to disable the kernel's use of SSE.
*/
extern int use_sse_pagecopy, use_sse_pagezero;
/*
* Combined memory ranges from mnode and memranges[], so that the page lists
* can be managed along a single dimension.
*/
typedef struct {
int mnr_mnode;
int mnr_memrange; /* index into memranges[] */
/* maintain page list stats */
#ifdef DEBUG
int mnr_mts_colors;
} *mnr_mts;
#endif
} mnoderange_t;
#define MEMRANGEHI(mtype) \
/*
* As the PC architecture evolved, memory was clumped into several
* ranges for various historical I/O devices to do DMA.
* < 16Meg - ISA bus
* < 2Gig - ???
* < 4Gig - PCI bus or drivers that don't understand PAE mode
*
* These are listed in reverse order, so that we can skip over unused
* ranges on machines with small memories.
*
* For now under the Hypervisor, we'll only ever have one memrange.
*/
#define PFN_4GIG 0x100000
#define PFN_16MEG 0x1000
PFN_4GIG, /* pfn range for 4G and above */
0x80000, /* pfn range for 2G-4G */
PFN_16MEG, /* pfn range for 16M-2G */
0x00000, /* pfn range for 0-16M */
};
int nranges = NUM_MEM_RANGES;
/*
* This combines mem_node_config and memranges into one data
* structure to be used for page list management.
*/
int mnoderangecnt;
int mtype4g;
/*
* 4g memory management variables for systems with more than 4g of memory:
*
* physical memory below 4g is required for 32bit dma devices and, currently,
* for kmem memory. On systems with more than 4g of memory, the pool of memory
* below 4g can be depleted without any paging activity given that there is
* likely to be sufficient memory above 4g.
*
* physmax4g is set true if the largest pfn is over 4g. The rest of the
* 4g memory management code is enabled only when physmax4g is true.
*
* maxmem4g is the count of the maximum number of pages on the page lists
* with physical addresses below 4g. It can be a lot less than 4g given that
* BIOS may reserve large chunks of space below 4g for hot plug pci devices,
* agp aperture etc.
*
* freemem4g maintains the count of the number of available pages on the
* page lists with physical addresses below 4g.
*
* DESFREE4G specifies the desired amount of below 4g memory. It defaults to
* 6% (desfree4gshift = 4) of maxmem4g.
*
* RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
* and the amount of physical memory above 4g is greater than freemem4g.
* In this case, page_get_* routines will restrict below 4g allocations
* for requests that don't specifically require it.
*/
#define RESTRICT4G_ALLOC \
static int physmax4g;
static int lotsfree4gshift = 3;
/*
* 16m memory management:
*
* reserve some amount of physical memory below 16m for legacy devices.
*
* RESTRICT16M_ALLOC returns true if there are sufficient free pages above
* 16m or if the 16m pool drops below DESFREE16M.
*
* In this case, general page allocations via page_get_{free,cache}list
* routines will be restricted from allocating from the 16m pool. Allocations
* that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
* are not restricted.
*/
#define FREEMEM16M MTYPE_FREEMEM(0)
#define DESFREE16M desfree16m
((freemem >= (FREEMEM16M)) || \
/*
* drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
*/
int restricted_kmemalloc = 0;
#ifdef VM_STATS
struct {
} pga_vmstats;
#endif
/* How many page sizes the users can see */
/* page sizes that legacy applications can see */
/*
* Number of pages in 1 GB. Don't enable automatic large pages if we have
* fewer than this many pages.
*/
/*
* Maximum and default segment size tunables for user private
* and shared anon memory, and user text and initialized data.
* to be used for mapping application private and shared anon memory.
*/
/*
* initialized by page_coloring_init().
*/
int cpu_page_colors;
/*
* Page freelists and cachelists are dynamically allocated once mnoderangecnt
* and page_colors are calculated from the l2 cache n-way set size. Within a
* mnode range, the page freelist and cachelist are hashed into bins based on
* color. This makes it easier to search for a page within a specific memory
* range.
*/
#define PAGE_COLORS_MIN 16
page_t ****page_freelists;
/*
* Used by page layer to know about page sizes
*/
/*
* Only let one thread at a time try to coalesce large pages, to
* prevent them from working against each other.
*/
static kmutex_t contig_lock;
/*
* Return the optimum page size for a given mapping
*/
/*ARGSUSED*/
{
level_t l = 0;
return (MMU_PAGESIZE);
}
switch (maptype) {
case MAPPGSZ_HEAP:
case MAPPGSZ_STK:
if (max_lpsize == MMU_PAGESIZE) {
return (MMU_PAGESIZE);
}
if (len == 0) {
}
/*
* use the pages size that best fits len
*/
for (l = mmu.umax_page_level; l > 0; --l) {
continue;
} else {
pgsz = LEVEL_SIZE(l);
}
break;
}
p->p_stkpageszc);
}
return (pgsz);
case MAPPGSZ_ISM:
for (l = mmu.umax_page_level; l > 0; --l) {
if (len >= LEVEL_SIZE(l))
return (LEVEL_SIZE(l));
}
return (LEVEL_SIZE(0));
}
return (pgsz);
}
static uint_t
{
int i;
return (0);
}
for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
pgsz = page_get_pagesize(i);
if (pgsz > max_lpsize) {
continue;
}
continue;
}
continue;
}
/*
* Set szcvec to the remaining page sizes.
*/
break;
}
return (szcvec);
}
/*
* Return a bit vector of large page size codes that
* can be used to map [addr, addr + len) region.
*/
/*ARGSUSED*/
int memcntl)
{
if (mmu.max_page_level == 0)
return (0);
if (!memcntl)
} else if (flags & MAP_INITDATA) {
if (!memcntl)
} else if (type == MAPPGSZC_SHM) {
if (!memcntl)
} else if (type == MAPPGSZC_HEAP) {
if (!memcntl)
} else if (type == MAPPGSZC_STACK) {
if (!memcntl)
} else {
if (!memcntl)
}
}
/*
* Handle a pagefault.
*/
enum fault_type type,
int iskernel)
{
struct proc *p;
kthread_t *t;
int err;
int mapped_red;
if (INVALID_VADDR(addr))
return (FC_NOMAP);
mapped_red = segkp_map_red();
if (iskernel) {
} else {
t = curthread;
p = ttoproc(t);
}
/*
* Dispatch pagefault.
*/
/*
* If this isn't a potential unmapped hole in the user's
* UNIX data or stack segments, just return status info.
*/
goto out;
/*
* Check to see if we happened to fault on a currently unmapped
* part of the UNIX data or stack segments. If so, create a zfod
* mapping there and then try calling the fault routine again.
*/
/* not in either UNIX data or stack segments */
goto out;
}
}
/*
* The rest of this function implements 3.X/4.X/5.X compatibility.
* This code is probably not needed anymore.
*/
if (p->p_model == DATAMODEL_ILP32) {
/* expand the gap to the page boundaries on each side */
0) {
if (err) {
goto out;
}
} else {
/*
* This page is already mapped by another thread after
* we returned from as_fault() above. We just fall
* through as_fault() below.
*/
}
}
out:
if (mapped_red)
return (res);
}
void
{
}
/*ARGSUSED*/
int
{
return (0);
}
/*
* map_addr_proc() is the routine called when the system is to
* choose an address for the user. We will pick an address
* range which is the highest available below userlimit.
*
* Every mapping will have a redzone of a single page on either side of
* the request. This is done to leave one page unmapped between segments.
* This is not required, but it's useful for the user because if their
* program strays across a segment boundary, it will catch a fault
* immediately making debugging a little easier. Currently the redzone
* is mandatory.
*
* On input it is a hint from the user to be used in a completely
* machine dependent fashion. We decide to completely ignore this hint.
* If MAP_ALIGN was specified, addrp contains the minimal alignment, which
* must be some "power of two" multiple of pagesize.
*
* On output it is NULL if no address can be found in the current
* process's address space or else an address that is currently
* not mapped for len bytes with a page of red zone on either side.
*
* vacalign is not needed on x86 (it's for virtually addressed caches)
*/
/*ARGSUSED*/
void
int vacalign,
struct proc *p,
{
#if defined(__amd64)
/*
* XX64 Yes, this needs more work.
*/
if (p->p_model == DATAMODEL_NATIVE) {
/*
* This happens when a program wants to map
* something in a range that's accessible to a
* program in a smaller address space. For example,
* a 64-bit program calling mmap32(2) to guarantee
* that the returned address is below 4Gbytes.
*/
else {
return;
}
} else {
/*
* XX64 This layout is probably wrong .. but in
* the event we make the amd64 address space look
* like sparcv9 i.e. with the stack -above- the
* heap, this bit of code might even be correct.
*/
}
} else
#endif
/* Make len be a multiple of PAGESIZE */
/*
* figure out what the alignment should be
*
* XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
*/
if (len <= ELF_386_MAXPGSZ) {
/*
* Align virtual addresses to ensure that ELF shared libraries
* are mapped with the appropriate alignment constraints by
* the run-time linker.
*/
} else {
int l = mmu.umax_page_level;
while (l && len < LEVEL_SIZE(l))
--l;
align_amount = LEVEL_SIZE(l);
}
/*
* Look for a large enough hole starting below userlimit.
* After finding it, use the upper part.
*/
/*
* addr is the highest possible address to use since we have
* a PAGESIZE redzone at the beginning and end.
*/
/*
* Round address DOWN to the alignment amount and
* add the offset in.
* If addr is greater than as_addr, len would not be large
* enough to include the redzone, so we must adjust down
* by the alignment amount.
*/
addr -= align_amount;
}
} else {
}
}
/*
* Determine whether [*basep, *basep + *lenp) contains a mappable range of
* addresses at least "minlen" long, where the base of the range is at "off"
* phase from an "align" boundary and there is space for a "redzone"-sized
* redzone on either side of the range. On success, 1 is returned and *basep
* and *lenp are adjusted to describe the acceptable range (including
* the redzone). On failure, 0 is returned.
*/
/*ARGSUSED3*/
int
{
/*
* If hi rolled over the top, try cutting back.
*/
/* See if this really happens. If so, then we figure out why */
}
return (0);
}
#if defined(__amd64)
/*
* Deal with a possible hole in the address range between
* hole_start and hole_end that should never be mapped.
*/
if (lo < hole_start) {
if (hi > hole_start) {
hi = hole_start;
} else {
/* lo < hole_start && hi >= hole_end */
/*
* prefer lowest range
*/
hi = hole_start;
else
return (0);
} else {
/*
* prefer highest range
*/
hi = hole_start;
else
return (0);
}
}
}
} else {
/* lo >= hole_start */
return (0);
}
#endif
return (0);
if (align > 1) {
return (0);
}
return (0);
}
}
return (1);
}
/*
* Determine whether [*basep, *basep + *lenp) contains a mappable range of
* addresses at least "minlen" long. On success, 1 is returned and *basep
* and *lenp are adjusted to describe the acceptable range. On failure, 0
* is returned.
*/
int
{
}
/*
* Determine whether [addr, addr+len] are valid user addresses.
*/
/*ARGSUSED*/
int
{
return (RANGE_BADADDR);
#if defined(__amd64)
/*
* Check for the VA hole
*/
return (RANGE_BADADDR);
#endif
return (RANGE_OKAY);
}
/*
* Return 1 if the page frame is onboard memory, else 0.
*/
int
{
if (pfn_is_foreign(pf))
return (0);
}
/*
* return the memrange containing pfn
*/
int
{
int n;
for (n = 0; n < nranges - 1; ++n) {
break;
}
return (n);
}
/*
* return the mnoderange containing pfn
*/
/*ARGSUSED*/
int
{
#if defined(__xpv)
return (0);
#else
int n;
for (n = mnoderangecnt - 1; n >= 0; n--) {
break;
}
}
return (n);
#endif
}
#if !defined(__xpv)
/*
* is_contigpage_free:
* returns a page list of contiguous pages. It minimally has to return
* minctg pages. Caller determines minctg based on the scatter-gather
* list length.
*
* pfnp is set to the next page frame to search on return.
*/
static page_t *
int iolock)
{
int i = 0;
/*
* fail if pfn + minctg crosses a segment boundary.
* Adjust for next starting pfn to begin at segment boundary.
*/
return (NULL);
}
do {
(*pfnp)++;
break;
}
goto retry;
}
(*pfnp)++;
break;
}
} else {
}
if (iolock)
/*
* exit loop when pgcnt satisfied or segment boundary reached.
*/
*pfnp += i; /* set to next pfn to search */
if (i >= minctg) {
*pgcnt -= i;
return (plist);
}
/*
* failure: minctg not satisfied.
*
* if next request crosses segment boundary, set next pfn
* to search from the segment boundary.
*/
/* clean up any pages already allocated */
while (plist) {
if (iolock)
}
return (NULL);
}
#endif /* !__xpv */
/*
* verify that pages being returned from allocator have correct DMA attribute
*/
#ifndef DEBUG
#define check_dma(a, b, c) (0)
#else
static void
{
return;
while (cnt-- > 0) {
}
}
#endif
#if !defined(__xpv)
static page_t *
{
int sgllen;
static pgcnt_t lastctgcnt;
CONTIG_LOCK();
if (mattr) {
if (align > MMU_PAGESIZE)
/*
* in order to satisfy the request, must minimally
* acquire minctg contiguous pages
*/
/*
* start from where last searched if the minctg >= lastctgcnt
*/
} else {
lo = 0;
sgllen = 1;
if (minctg < lastctgcnt)
}
lastctgcnt = minctg;
/* conserve 16m memory - start search above 16m when possible */
if (pfnalign)
if (plist) {
sgllen--;
/*
* return when contig pages no longer needed
*/
return (pplist);
}
}
if (pfnalign)
}
/* cannot find contig pages in specified range */
return (NULL);
}
/* did not start with lo previously */
if (pfnalign)
/* allow search to go above startpfn */
sgllen--;
/*
* return when contig pages no longer needed
*/
return (pplist);
}
}
if (pfnalign)
}
return (NULL);
}
#endif /* !__xpv */
/*
* mnode_range_cnt() calculates the number of memory ranges for mnode and
* memranges[]. Used to determine the size of page lists and mnoderanges.
*
* Returns 1 when built for the hypervisor (__xpv); otherwise returns the
* count accumulated in mnrcnt. The per-mnode results are summed over all
* memory nodes to produce mnoderangecnt.
*/
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
/* always exactly one range under the hypervisor */
return (1);
#else /* __xpv */
int mri;
int mnrcnt = 0;
/* find the memranges index below contained in mnode range */
mri--;
/*
* increment mnode range counter when memranges or mnode
* boundary is reached.
*/
/* mri walks downward; the walk ends once it drops below 0 */
while (mri >= 0 &&
mnrcnt++;
mri--;
else
break;
}
}
return (mnrcnt);
#endif /* __xpv */
}
/*
* mnode_range_setup() initializes mnoderanges.
*/
void
{
continue;
mri--;
MEMRANGELO(mri)) {
mnoderanges++;
mri--;
else
break;
}
}
}
/*ARGSUSED*/
int
{
#if !defined(__xpv)
#if defined(__i386)
/*
* set the mtype range
* - kmem requests needs to be below 4g if restricted_kmemalloc is set.
* - for non kmem requests, set range to above 4g if memory below 4g
* runs low.
*/
*flags |= PGI_MT_RANGE16M;
} else {
*flags |= PGI_MT_RANGE0;
}
return (mtype);
}
#endif /* __i386 */
if (RESTRICT4G_ALLOC) {
/* here only for > 4g systems */
*flags |= PGI_MT_RANGE4G;
*flags |= PGI_MT_RANGE16M;
} else {
*flags |= PGI_MT_RANGE0;
}
#endif /* !__xpv */
return (mtype);
}
/*
* mtype init for page_get_replacement_page.
*
* The range-flag selection below is compiled only when not building for the
* hypervisor (__xpv), consistent with the other __xpv guards in this file.
* Stores PGI_MT_RANGE16M or PGI_MT_RANGE0 into *flags and returns mtype.
*/
/*ARGSUSED*/
int
{
#if !defined(__xpv)
*flags |= PGI_MT_RANGE16M;
} else {
*flags |= PGI_MT_RANGE0;
}
#endif /* !__xpv */
return (mtype);
}
/*
* Determine if the mnode range specified in mtype contains memory belonging
* to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
* the range of indices from high pfn to 0, 16m or 4g.
*
* Return first mnode range type index found otherwise return -1 if none found.
*/
int
{
if (flags & PGI_MT_RANGE) {
int mtlim = 0;
if (flags & PGI_MT_NEXT)
mtype--;
if (flags & PGI_MT_RANGE4G)
else if (flags & PGI_MT_RANGE16M)
return (mtype);
mtype--;
}
return (mtype);
}
return (-1);
}
/*
* Update the page list max counts with the pfn range specified by the
* input parameters. Called from add_physmem() when physical memory with
* page_t's are initially added to the page lists.
*/
void
{
int mtype = 0;
if (!physmax4g)
return;
} else {
}
}
mtype++;
}
}
int
mtype_2_mrange(int mtype)
{
}
void
{
}
{
#ifdef DEBUG
}
#endif
return (ctrs_sz);
}
{
#ifdef DEBUG
}
}
#endif
return (addr);
}
void
{
#ifdef DEBUG
cnt);
#endif
if (flags & PG_CACHE_LIST)
else
}
/*
* Returns the free page count for mnode.
*
* The walk over the mnode's range types ends once mtype is -1; the result
* is returned in pgcnt.
*/
int
mnode_pgcnt(int mnode)
{
/* NOTE(review): PGI_MT_RANGE0 presumably selects every range down to pfn 0 - confirm */
int flags = PGI_MT_RANGE0;
/* -1 is the "no further range" marker that terminates the walk */
while (mtype != -1) {
}
return (pgcnt);
}
/*
* Initialize page coloring variables based on the l2 cache parameters.
* Calculate and return memory needed for page coloring data structures.
*/
{
int i;
int colors;
#if defined(__xpv)
/*
* Hypervisor domains currently don't have any concept of NUMA.
* Hence we'll act like there is only 1 memrange.
*/
i = memrange_num(1);
#else /* !__xpv */
/*
* Reduce the memory ranges lists if we don't have large amounts
* of memory. This avoids searching known empty free lists.
*/
i = memrange_num(physmax);
#if defined(__i386)
if (i > 0)
restricted_kmemalloc = 0;
#endif
/* physmax greater than 4g */
if (i == 0)
physmax4g = 1;
#endif /* !__xpv */
memranges += i;
nranges -= i;
/* l2_assoc is 0 for fully associative l2 cache */
if (l2_assoc)
else
l2_colors = 1;
/* for scalability, configure at least PAGE_COLORS_MIN color bins */
/*
* cpu_page_colors is non-zero when a page color may be spread across
* multiple bins.
*/
if (l2_colors < page_colors)
/* initialize number of colors per page size */
for (i = 0; i <= mmu.max_page_level; i++) {
+ 1;
colorequivszc[i] = 0;
}
/*
* The value of cpu_page_colors determines if additional color bins
* need to be checked for a particular color in the page_get routines.
*/
if (cpu_page_colors != 0) {
ASSERT(a > 0);
ASSERT(a < 16);
for (i = 0; i <= mmu.max_page_level; i++) {
colorequivszc[i] = 0;
continue;
}
while ((colors >> a) == 0)
a--;
ASSERT(a >= 0);
/* higher 4 bits encodes color equiv mask */
colorequivszc[i] = (a << 4);
}
}
/* factor in colorequiv to check additional 'equivalent' bins. */
if (colorequiv > 1) {
if (a > 15)
a = 15;
for (i = 0; i <= mmu.max_page_level; i++) {
continue;
}
while ((colors >> a) == 0)
a--;
if ((a << 4) > colorequivszc[i]) {
colorequivszc[i] = (a << 4);
}
}
}
/* size for mnoderanges */
for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
mnoderangecnt += mnode_range_cnt(i);
/* size for fpc_mutex and cpc_mutex */
/* size of page_freelists */
for (i = 0; i < mmu_page_sizes; i++) {
colors = page_get_pagecolors(i);
}
/* size of page_cachelists */
return (colorsz);
}
/*
* Called once at startup to configure page_coloring data structures and
* does the 1st page_free()/page_freelist_add().
*/
void
{
int i;
int j;
int k;
int colors;
/*
* do page coloring setup
*/
if (physmax4g)
for (k = 0; k < NPC_MUTEX; k++) {
}
for (k = 0; k < NPC_MUTEX; k++) {
}
for (i = 0; i < mnoderangecnt; i++) {
for (j = 0; j < mmu_page_sizes; j++) {
colors = page_get_pagecolors(j);
}
}
}
#if defined(__xpv)
/*
* Give back 10% of the io_pool pages to the free list.
* Don't shrink the pool below some absolute minimum.
*/
static void
{
int retcnt;
int bothpools = 0;
io_pool_shrink_attempts++; /* should be a kstat? */
if (retcnt <= 0)
goto done;
io_pool_shrinks++; /* should be a kstat? */
curpool = &io_pool_4g;
/*
* Loop through taking pages from the end of the list
* (highest mfns) till amount to return reached.
*/
break;
retcnt--;
io_pool_cnt--;
}
/*
* If not enough found in less constrained pool try the
* more constrained one.
*/
curpool = &io_pool_16m;
bothpools = 1;
goto domore;
}
done:
}
#endif /* __xpv */
{
#if defined(__xpv)
/*
* Check this is an urgent allocation and free pages are depleted.
*/
#else /* !__xpv */
/*
* page_create_get_something may call this because 4g memory may be
* depleted. Set flags to allow for relocation of base page below
* 4g if necessary.
*/
if (physmax4g)
#endif /* __xpv */
return (flags);
}
/*ARGSUSED*/
int
{
return (0);
}
#if defined(__xpv)
/*
* Take pages out of an io_pool
*/
static void
{
}
}
/*
* Put a page on the io_pool list. The list is ordered by increasing MFN.
*/
static void
{
return;
}
/*
* Since we try to take pages from the high end of the pool
* chances are good that the pages to be put on the list will
* go at or near the end of the list. so start at the end and
* work backwards.
*/
break; /* backed all the way to front of list */
}
/* insert after look */
/*
* we inserted a new first list element
* adjust pool pointer to newly inserted element
*/
}
}
/*
* Add a page to the io_pool. Setting the force flag will force the page
* into the io_pool no matter what.
*/
static void
{
/*
* Always keep the scarce low memory pages
*/
++io_pool_cnt;
goto done;
}
++io_pool_cnt;
} else {
} else {
}
}
done:
if (freep)
}
int contig_pfn_cnt; /* no of pfns in the contig pfn list */
int contig_pfn_max; /* capacity of the contig pfn list */
int next_alloc_pfn; /* next position in list to start a contig search */
int contig_pfnlist_updates; /* pfn list update count */
int contig_pfnlist_builds; /* how many times have we (re)built list */
int contig_pfnlist_buildfailed; /* how many times has list build failed */
int create_contig_pending; /* nonzero means taskq creating contig list */
/*
* Function to use in sorting a list of pfns by their underlying mfns.
*/
static int
{
return (1);
return (-1);
return (0);
}
/*
* Compact the contig_pfn_list by tossing all the non-contiguous
* elements from the list.
*
* Entries that are kept are counted in newcnt; every slot from newcnt up to
* contig_pfn_cnt is zeroed before returning.
*/
static void
compact_contig_pfn_list(void)
{
int i, newcnt = 0;
prev_lapfn = 0;
/*
* Stop one entry short of the end of the list.
* NOTE(review): presumably because each entry is examined together
* with its successor (the lookahead) - confirm.
*/
for (i = 0; i < contig_pfn_cnt - 1; i++) {
pfn = contig_pfn_list[i];
/*
* See if next pfn is for a contig mfn
*/
continue;
/*
* pfn and lookahead are both put in list
* unless pfn is the previous lookahead.
*/
if (pfn != prev_lapfn)
/* remember this lookahead so it is not entered twice */
prev_lapfn = lapfn;
}
/* clear the unused tail of the list */
for (i = newcnt; i < contig_pfn_cnt; i++)
contig_pfn_list[i] = 0;
}
/*
* Wrapper that calls create_contig_pfnlist() with PG_WAIT and discards its
* return value; the argument is not used.
* NOTE(review): presumably this is the routine handed to the taskq that
* create_contig_pfnlist() dispatches when it could not sleep for memory -
* confirm at the dispatch site.
*/
/*ARGSUSED*/
static void
call_create_contiglist(void *arg)
{
(void) create_contig_pfnlist(PG_WAIT);
}
/*
* Create list of freelist pfns that have underlying
* contiguous mfns. The list is kept in ascending mfn order.
* returns 1 if list created else 0.
*/
static int
{
int ret = 1;
if (contig_pfn_list != NULL)
goto out;
if (contig_pfn_list == NULL) {
/*
* If we could not create the contig list (because
* we could not sleep for memory), dispatch a taskq that can
* sleep to get the memory.
*/
if (!create_contig_pending) {
}
contig_pfnlist_buildfailed++; /* count list build failures */
ret = 0;
goto out;
}
ASSERT(contig_pfn_cnt == 0);
continue;
if (++contig_pfn_cnt == contig_pfn_max)
break;
}
/*
* Make sure next search of the newly created contiguous pfn
* list starts at the beginning of the list.
*/
next_alloc_pfn = 0;
contig_pfnlist_builds++; /* count list builds */
out:
return (ret);
}
/*
* Toss the current contig pfnlist. Someone is about to do a massive
* update to pfn<->mfn mappings. So we have them destroy the list and lock
* it till they are done with their update.
*/
void
{
if (contig_pfn_list != NULL) {
contig_pfn_max = contig_pfn_cnt = 0;
}
}
/*
* Unlock the contig_pfn_list. The next attempted use of it will cause
* it to be re-created.
*/
void
{
}
/*
* Update the contiguous pfn list in response to a pfn <-> mfn reassignment.
*
* Does nothing when no contig pfn list currently exists. Otherwise the pfn
* is first removed from the list if it is present; then, unless the new mfn
* is MFN_INVALID, the pfn is re-inserted when there is space in the list
* and the new mfn has adjacent mfns in it.
*/
void
{
/* NOTE(review): drop_lock presumably records that this call took the list lock - confirm */
int drop_lock = 0;
drop_lock = 1;
}
/* no list has been built, so there is nothing to update */
if (contig_pfn_list == NULL)
goto done;
/*
* Find the pfn in the current list. Use a binary chop to locate it.
*/
probe_lo = 0;
/* -1 means "pfn not found in the list" */
probe_pos = -1;
break;
}
else
}
if (probe_pos >= 0) { /* remove pfn from list */
}
/* an invalid new mfn means there is nothing to insert */
if (newmfn == MFN_INVALID)
goto done;
/*
* Check if new mfn has adjacent mfns in the list
*/
probe_lo = 0;
/* -2 marks the insert position as not yet determined */
insert_after = -2;
do {
break;
else
} while (insert_after == -2);
/*
* If there is space in the list and there are adjacent mfns
* insert the pfn in to its proper place in the list.
*/
}
done:
if (drop_lock)
}
/*
* Called to (re-)populate the io_pool from the free page lists.
*
* Returns io_pool_cnt, the number of pages in the io pool when the
* function finishes.
*/
long
populate_io_pool(void)
{
/*
* Figure out the bounds of the pool on first invocation.
* We use a percentage of memory for the io pool size.
* We allow that to shrink, but not to less than a fixed minimum.
*/
/* a zero io_pool_cnt_max identifies the first invocation */
if (io_pool_cnt_max == 0) {
/*
* This is the first time in populate_io_pool, grab a va to use
* when we need to allocate pages.
*/
}
/*
* If we are out of pages in the pool, then grow the size of the pool
*/
if (io_pool_cnt == 0) {
/*
* Grow the max size of the io pool by 5%, but never more than
* 25% of physical memory.
*/
}
io_pool_grows++; /* should be a kstat? */
/*
* Get highest mfn on this platform, but limit to the 32 bit DMA max.
*/
(void) mfn_to_pfn(start_mfn);
/* skip mfns whose pfn is flagged as foreign */
if (pfn & PFN_IS_FOREIGN_MFN)
continue;
/*
* try to allocate it from free pages
*/
continue;
PP_CLRFREE(pp);
/* stop once the pool has reached its current maximum size */
if (io_pool_cnt >= io_pool_cnt_max)
break;
}
return (io_pool_cnt);
}
/*
* Destroy a page that was being used for DMA I/O. It may or
* may not actually go back to the io_pool.
*/
void
{
/*
* When the page was alloc'd a reservation was made, release it now
*/
page_unresv(1);
/*
* Unload translations, if any, then hash out the
* page to erase its identity.
*/
/*
* If the page came from the free lists, just put it back to them.
* DomU pages always go on the free lists as well.
*/
return;
}
add_page_to_pool(pp, 0);
}
long contig_searches; /* count of times contig pages requested */
long contig_search_restarts; /* count of contig ranges tried */
long contig_search_failed; /* count of contig alloc failures */
/*
* Look thru the contiguous pfns that are not part of the io_pool for
* contiguous free pages. Return a list of the found pages or NULL.
*/
page_t *
{
int pages_needed, pages_requested;
int search_start;
/*
* create the contig pfn list if not already done
*/
if (contig_pfn_list == NULL) {
if (!create_contig_pfnlist(flags)) {
return (NULL);
}
goto retry;
}
/*
* Search contiguous pfn list for physically contiguous pages not in
* the io_pool. Start the search where the last search left off.
*/
while (pages_needed) {
/*
* Check if mfn is first one or contig to previous one and
* if page corresponding to mfn is free and that mfn
* range is not crossing a segment boundary.
*/
PP_CLRFREE(pp);
pages_needed--;
if (prev_mfn == 0)
} else {
/*
* free partial page list
*/
}
}
if (++next_alloc_pfn == contig_pfn_cnt)
next_alloc_pfn = 0;
if (next_alloc_pfn == search_start)
break; /* all pfns searched */
}
if (pages_needed) {
/*
* Failed to find enough contig pages.
* free partial page list
*/
}
}
return (plist);
}
/*
* Search the reserved io pool pages for a page range with the
* desired characteristics.
*/
page_t *
{
if (minctg == 1)
contig = 0;
if (align > MMU_PAGESIZE)
else
pfnalign = 0;
/*
* See if we want pages for a legacy device
*/
poolp = &io_pool_16m;
else
poolp = &io_pool_4g;
/*
* Take pages from I/O pool. We'll use pages from the highest
* MFN range possible.
*/
/*
* skip pages above allowable range
*/
goto skip;
/*
* stop at pages below allowable range
*/
break;
/*
* Check alignment
*/
goto skip; /* not properly aligned */
/*
* Check segment
*/
goto skip; /* crosses seg boundary */
/*
* Start building page list
*/
nwanted--;
} else {
/*
* check physical contiguity if required
*/
if (contig &&
/*
* not a contiguous page, restart list.
*/
goto restart;
} else { /* add page to list */
nwanted--;
}
}
skip:
break;
}
/*
* If we didn't find memory, try the more constrained pool, then
* sweep free pages into the DMA pool and try again.
*/
if (nwanted != 0) {
/*
* If we were looking in the less constrained pool and
* didn't find pages, try the more constrained pool.
*/
if (poolp == &io_pool_4g) {
poolp = &io_pool_16m;
goto try_smaller;
}
kmem_reap();
if (++attempt < 4) {
/*
* Grab some more io_pool pages
*/
(void) populate_io_pool();
goto try_again; /* go around and retry */
}
return (NULL);
}
/*
* Found the pages, now snip them from the list
*/
io_pool_cnt -= minctg;
/*
* reset low water mark
*/
if (io_pool_cnt < io_pool_cnt_lowater)
return (pp_first);
}
page_t *
{
if (minctg == 1)
contig = 0;
flags &= ~PG_PHYSCONTIG;
/*
* Hypervisor will allocate extents, if we want contig
* pages extent must be >= minctg
*/
if (contig) {
order++;
} else {
order = 0;
}
return (NULL);
}
goto balloon_fail;
goto balloon_fail;
goto balloon_fail;
/*
* fill out the rest of extent pages to swap
* with the hypervisor
*/
for (i = 0; i < extra; i++) {
goto balloon_fail;
/*
* add page to end of list
*/
}
}
for (i = 0; i < extpages; i++) {
}
if (ioalloc_dbg)
goto balloon_fail;
}
/*
* Return any excess pages to free list
*/
for (i = 0; i < extra; i++) {
page_unresv(1);
}
}
return (pp_first);
/*
* Return pages to free list and return failure
*/
}
if (pplist)
if (mfnlist)
return (NULL);
}
static void
{
}
}
static page_t *
int *npagesp,
{
if (align > MMU_PAGESIZE)
/*
* Clear the contig flag if only one page is needed.
*/
if (npages == 1) {
getone = 1;
contig = 0;
}
/*
* Check if any page in the system is fine.
*/
flags &= ~PG_PHYSCONTIG;
*npagesp = 0;
return (plist);
}
}
/*
* We could just want unconstrained but contig pages.
*/
/*
* Look for free contig pages to satisfy the request.
*/
}
/*
* Try the reserved io pools next
*/
do {
panic("page_get_contigpages:"
" hashin failed"
" pp %p, vp %p, off %llx",
}
off += MMU_PAGESIZE;
PP_CLRFREE(pp);
PP_CLRAGED(pp);
} else {
/*
* Hypervisor exchange doesn't handle segment or
* alignment constraints
*/
goto fail;
/*
* Try exchanging pages with the hypervisor
*/
goto fail;
}
/*
* Here with a minctg run of contiguous pages, add them to the
* list we will return for this request.
*/
sgllen--;
if (getone)
break;
}
return (plist);
fail:
return (NULL);
}
/*
* Allocator for domain 0 I/O pages. We match the required
* DMA attributes and contiguity constraints.
*/
/*ARGSUSED*/
page_t *
{
int align;
int is_domu = 0;
if (align > MMU_PAGESIZE)
/*
* Clear the contig flag if only one page is needed or the scatter
* gather list length is >= npages.
*/
contig = 0;
/*
* Check if any old page in the system is fine.
* DomU should always go down this path.
*/
flags &= ~PG_PHYSCONTIG;
return (plist);
else if (is_domu)
return (NULL); /* no memory available */
}
/*
* DomU should never reach here
*/
if (contig) {
mattr);
goto fail;
/*
* We now have all the contiguous pages we need, but
* we may still need additional non-contiguous pages.
*/
}
/*
* now loop collecting the requested number of pages, these do
* not have to be contiguous pages but we will use the contig
* page alloc code to get the pages since it will honor any
* other constraints the pages may have.
*/
while (npages--) {
dummy = 1;
goto fail;
vaddr += MMU_PAGESIZE;
off += MMU_PAGESIZE;
}
return (plist);
fail:
/*
* Failed to get enough pages, return ones we did get
*/
return (NULL);
}
/*
* Lock and return the page with the highest mfn that we can find. last_mfn
* holds the last one found, so the next search can start from there. We
* also keep a counter so that we don't loop forever if the machine has no
* free pages.
*
* This is called from the balloon thread to find pages to give away. new_high
* is used when new mfn's have been added to the system - we will reset our
* search if the new mfn's are higher than our current search position.
*/
page_t *
{
ulong_t loop_count = 0;
if (last_mfn == 0) {
}
if (pfn & PFN_IS_FOREIGN_MFN)
continue;
/* See if the page is free. If so, lock it. */
continue;
PP_CLRFREE(pp);
last_mfn--;
return (pp);
}
return (NULL);
}
#else /* !__xpv */
/*
* Get a page from any list with the given mnode.
*
* The freelist is searched first and the cachelist second. Returns the
* page found, or NULL when mtype is negative on entry or when the end of
* the function is reached without a page.
*
* NOTE(review): the function name and parameter list are not visible at
* this point in the file, and many statements of the body are missing.
*/
static page_t *
{
int i;
int mtypestart;
int plw_initialized;
if (mtype < 0) {
return (NULL);
}
/* Remember the starting mtype so the cachelist search can restart from it. */
mtypestart = mtype;
/*
* check up to page_colors + 1 bins - origbin may be checked twice
* because of BIN_STEP skip
*/
do {
plw_initialized = 0;
goto nextfreebin;
}
continue;
}
/* check if page within DMA attributes */
dma_attr->dma_attr_addr_hi)) {
break;
}
/* continue looking */
}
/* found a page with specified DMA attributes */
(void *)pp);
}
return (pp);
}
if (plw_initialized == 0) {
plw_initialized = 1;
}
if (plw.plw_do_split) {
&plw);
return (pp);
}
}
} while (mtype >= 0);
/* failed to find a page in the freelist; try it in the cachelist */
/* reset mtype start for cachelist search */
mtype = mtypestart;
/* start with the bin of matching color */
do {
for (i = 0; i <= page_colors; i++) {
goto nextcachebin;
break;
continue;
}
/* check if page within DMA attributes */
dma_attr->dma_attr_addr_hi)) {
break;
}
/* continue looking */
}
/* found a page with specified DMA attributes */
return (pp);
}
bin &= page_colors_mask;
}
} while (mtype >= 0);
return (NULL);
}
/*
* This function is similar to page_get_freelist()/page_get_cachelist()
* but it searches both the lists to find a page with the specified
* color (or no color) and DMA attributes. The search is done in the
* freelist first and then in the cache list within the highest memory
* range (based on DMA attributes) before searching in the lower
* memory ranges.
*
* Returns the page found, or NULL when size is not MMU_PAGESIZE, when
* the computed mtype range is empty (n > m), or when no page is found.
*
* Note: This function is called only by page_create_io().
*
* NOTE(review): the function name and parameter list are not visible at
* this point in the file, and many statements of the body are missing.
*/
/*ARGSUSED*/
static page_t *
{
int mtype;
int n;
int m;
int szc;
int fullrange;
int mnode;
int local_failed_stat = 0;
/* only base pagesize currently supported */
if (size != MMU_PAGESIZE)
return (NULL);
/*
* If we're passed a specific lgroup, we use it. Otherwise,
* assume first-touch placement is desired.
*/
if (!LGRP_EXISTS(lgrp))
lgrp = lgrp_home_lgrp();
/* LINTED */
/*
* Only hold one freelist or cachelist lock at a time, that way we
* can start anywhere and not have to worry about lock
* ordering.
*/
/* Full range: n and m span mtypes 0 through mnoderangecnt - 1. */
n = 0;
m = mnoderangecnt - 1;
fullrange = 1;
} else {
/*
* We can guarantee alignment only for page boundary.
*/
return (NULL);
/* Restricted range: derive the mtype bounds from pfnlo and pfnhi. */
n = pfn_2_mtype(pfnlo);
m = pfn_2_mtype(pfnhi);
}
if (n > m)
return (NULL);
szc = 0;
/* cycling through mtype is handled by RANGE0 if n == 0 */
if (n == 0) {
flags |= PGI_MT_RANGE0;
n = m;
}
/*
* Try local memory node first, but try remote if we can't
* get a page of the right color.
*/
/*
* allocate pages from high pfn to low.
*/
if (fullrange != 0) {
}
} else {
}
return (pp);
}
}
/* local_failed_stat makes the enclosed action happen at most once. */
if (!local_failed_stat) {
local_failed_stat = 1;
}
}
return (NULL);
}
/*
* page_create_io()
*
* This function is a copy of page_create_va() with an additional
* argument 'mattr' that specifies DMA memory requirements to
* the page list functions. This function is used by the segkmem
* allocator so it is only used to create new pages (i.e., PG_EXCL is
* set).
*
* Returns the list of pages (plist) on success and NULL on failure.
*
* Note: This interface is currently used by x86 PSM only and is
* not fully specified so the commitment level is only for
* private interface specific to x86. This interface uses PSM
* specific page_get_anylist() interface.
*
* NOTE(review): the function name line, the parameter list and many
* statements of the body are not visible at this point in the file.
*/
/*
* NOTE(review): the three lines below are the tail of a multi-line
* macro whose #define line is not visible here.
*/
break; \
} \
}
page_t *
{
"page_create_start:vp %p off %llx bytes %u flags %x",
/*
* Do the freemem and pcf accounting.
*/
return (NULL);
}
/*
* If satisfying this request has left us with too little
* memory, start the wheels turning to get some back. The
* first clause of the test prevents waking up the pageout
* daemon in situations where it would decide that there's
* nothing to do.
*/
"pageout_cv_signal:freemem %ld", freemem);
}
if (flags & PG_PHYSCONTIG) {
return (NULL);
}
do {
panic("pg_creat_io: hashin failed %p %p %llx",
}
off += MMU_PAGESIZE;
PP_CLRFREE(pp);
PP_CLRAGED(pp);
if (!npages) {
return (plist);
} else {
}
/*
* fall-thru:
*
* page_get_contigpage returns when npages <= sgllen.
* Grab the rest of the non-contig pages below from anylist.
*/
}
/*
* Loop around collecting the requested number of pages.
* Most of the time, we have to `create' a new page. With
* this in mind, pull the page off the free list before
* getting the hash lock. This will minimize the hash
* lock hold time, nesting, and the like. If it turns
* out we don't need the page, we put it back at the end.
*/
while (npages--) {
top:
/*
* Try to get the page of any color either from
* the freelist or from the cache list.
*/
/*
* Not looking for a special page;
* panic!
*/
}
/*
* No page found! This can happen
* if we are looking for a page
* within a specific memory range
* for DMA purposes. If PG_WAIT is
* specified then we wait for a
* while and then try again. The
* wait could be forever if we
* don't get the page(s) we need.
*
* Note: XXX We really need a mechanism
* to wait for pages in the desired
* range. For now, we wait for any
* pages and see if we can use it.
*/
delay(10);
goto top;
}
goto fail; /* undo accounting stuff */
}
/*
* Since this page came from the
* cachelist, we must destroy the
* old vnode association.
*/
}
}
/*
* We own this page!
*/
/*
* Here we have a page in our hot little mitts and are
* just waiting to stuff it on the appropriate lists.
* Get the mutex and check to see if it really does
* not exist.
*/
/*
* Since we hold the page hash mutex and
* just searched for this page, page_hashin
* had better not fail. If it does, that
* means some thread did not follow the
* page hash mutex rules. Panic now and
* get it over with. As usual, go down
* holding all the locks.
*/
panic("page_create: hashin fail %p %p %llx %p",
}
/*
* Hat layer locking need not be done to set
* the following bits since the page is not hashed
* and was on the free list (i.e., had no mappings).
*
* Set the reference bit to protect
* against immediate pageout
*
* XXXmh modify freelist code to set reference
* bit so we don't have to do it here.
*/
} else {
/*
* NOTE: This should not happen for pages associated
* with kernel vnode 'kvp'.
*/
/* XX64 - to debug why this happens! */
"page_create: page not expected "
"in hash list for kernel vnode - pp 0x%p",
(void *)pp);
goto fail;
}
/*
* Got a page! It is locked. Acquire the i/o
* lock since we are going to use the p_next and
* p_prev fields to link the requested pages together.
*/
/* Advance to the next page-sized offset and virtual address. */
off += MMU_PAGESIZE;
vaddr += MMU_PAGESIZE;
}
return (plist);
fail:
/*
* Did not need this page after all.
* Put it back on the free list.
*/
}
/*
* Give up the pages we already got.
*/
plist_len++;
/*LINTED: constant in conditional ctx*/
}
/*
* VN_DISPOSE does freemem accounting for the pages in plist
* by calling page_free. So, we need to undo the pcf accounting
* for only the remaining pages.
*/
return (NULL);
}
#endif /* !__xpv */
/*
* Copy the data from the physical page represented by "frompp" to
* that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
* CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
* level and no one sleeps with an active mapping there.
*
* NOTE(review): the opening of the next sentence is missing from this
* copy of the file; it presumably states that the pages' reference and
* modified bits are not affected by
* this operation, hence it is up to the caller to update them appropriately.
*
* The return value is ret, which starts at 1 and is set to 0 just before
* the jump to the faulted label.
*/
int
{
int ret = 1;
if (kpm_enable) {
} else {
/*
* disable pre-emption so that CPU can't change
*/
}
ret = 0;
goto faulted;
}
if (use_sse_pagecopy)
#ifdef __xpv
#else
#endif
else
no_fault();
if (!kpm_enable) {
#ifdef __xpv
/*
* We can't leave unused mappings lying about under the
* hypervisor, so blow them away.
*/
UVMF_INVLPG | UVMF_LOCAL) < 0)
panic("HYPERVISOR_update_va_mapping() failed");
UVMF_INVLPG | UVMF_LOCAL) < 0)
panic("HYPERVISOR_update_va_mapping() failed");
#endif
}
return (ret);
}
/*
* NOTE(review): the name, parameter list and body of this void function
* are not visible at this point in the file; confirm against the
* complete source.
*/
void
{
}
/*
* Zero the physical page from off to off + len given by pfn
* without changing the reference and modified bits of page.
*
* We do this using CPU private page address #2, see ppcopy() for more info.
* pfnzero() must not be called at interrupt level.
*
* NOTE(review): the function name line, the parameter list and most
* statements of the body are not visible at this point in the file.
*/
void
{
} else {
}
if (use_sse_pagezero) {
#ifdef __xpv
/*
* zero a byte at a time until properly aligned for
* block_zero_no_xmm().
*/
/*
* Now use faster block_zero_no_xmm() for any range
* that is properly aligned and sized.
*/
if (len != 0) {
}
/*
* zero remainder with byte stores.
*/
while (rem-- > 0)
#else
#endif
} else {
}
#ifdef __xpv
/*
* On the hypervisor this page might get used for a page
* table before any intervening change to this mapping,
* so blow it away.
*/
UVMF_INVLPG) < 0)
panic("HYPERVISOR_update_va_mapping() failed");
#endif
}
}
/*
* Platform-dependent page scrub call.
*
* NOTE(review): the function name, parameter list and the statement(s)
* of the body are not visible at this point in the file.
*/
void
{
/*
* For now, we rely on the fact that pagezero() will
* always clear UEs.
*/
}
/*
* Set up two private addresses on a given CPU for use in ppcopy().
*
* NOTE(review): the function name line, the parameter list and the
* statements that use addr are not visible at this point in the file.
*/
void
{
void *addr;
}
/*
* Undo setup_vaddr_for_ppcopy.
*
* Resets the cpu_caddr1, cpu_caddr1pte, cpu_caddr2 and cpu_caddr2pte
* fields of cpup to 0.
*
* NOTE(review): the function name and parameter list are not visible at
* this point in the file, and statements may be missing from the body.
*/
void
{
cpup->cpu_caddr2pte = 0;
cpup->cpu_caddr2 = 0;
cpup->cpu_caddr1pte = 0;
cpup->cpu_caddr1 = 0;
}
/*
* Create the pageout scanner thread. The thread has to
* start at the given procedure, in process pp, with priority pri.
*
* NOTE(review): the function name, parameter list and body are not
* visible at this point in the file.
*/
void
{
}
/*
* Function for flushing D-cache when performing module relocations
* to an alternate mapping. Unnecessary on Intel / AMD platforms, so
* the body is intentionally empty.
*
* NOTE(review): the function name and parameter list are not visible at
* this point in the file.
*/
void
{}
/*
* Takes no arguments and always returns 0.
*
* The return type is spelled out because a definition without one relies
* on implicit int, which has not been valid C since C99.
* NOTE(review): int is what the implicit declaration meant; confirm it
* against the prototype in the corresponding header.
*/
int
exec_get_spslew(void)
{
	return (0);
}
/*
* Allocate a memory page. The argument 'seed' can be any pseudo-random
* number to vary where the pages come from. This is quite a hacked up
* method -- it works for now, but really needs to be fixed up a bit.
*
* We currently use page_create_va() on the kvp with fake offsets,
* segments and virt address. This is pretty bogus, but was copied from the
* old hat_i86.c code. A better approach would be to specify either mnode
* random or mnode local and take a page from whatever color has the MOST
* available - this would have a minimal impact on page coloring.
*
* Returns the page (pp) on success, or NULL on the failure paths.
*
* NOTE(review): the function name line, the parameter list and most
* statements of the body (including the conditions guarding the NULL
* returns) are not visible at this point in the file.
*/
page_t *
{
/*
* This code is gross, we really need a simpler page allocator.
*
* We need to assign an offset for the page to call page_create_va().
* To avoid conflicts with other pages, we get creative with the offset.
* For 32 bits, we pick an offset > 4Gig
* For 64 bits, pick an offset somewhere in the VA hole.
*/
/* Rebase offsets above kernelbase, then scale the offset to bytes. */
if (offset > kernelbase)
offset -= kernelbase;
offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
#else
#endif
return (NULL);
#ifdef DEBUG
#endif
return (NULL);
return (pp);
}