seg_kmem.c revision e291592ab12a560fc73b0610963bb3fe66aab341
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/bootconf.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/mem_cage.h>
#ifdef __sparc
#endif
/*
* seg_kmem is the primary kernel memory segment driver. It
* maps the kernel heap [kernelheap, ekernelheap), module text,
* and all memory which was allocated before the VM was initialized
* into kas.
*
* Pages which belong to seg_kmem are hashed into &kvp vnode at
* an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1.
* They must never be paged out since segkmem_fault() is a no-op to
* prevent recursive faults.
*
* Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
* __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86
* supports relocation the #ifdef kludges can be removed.
*
* seg_kmem pages may be subject to relocation by page_relocate(),
* provided that the HAT supports it; if this is so, segkmem_reloc
* will be set to a nonzero value. All boot time allocated memory as
* well as static memory is considered off limits to relocation.
* Pages are "relocatable" if p_state does not have P_NORELOC set, so
* we request P_NORELOC pages for memory that isn't safe to relocate.
*
* The kernel heap is logically divided up into four pieces:
*
 * heap32_arena is for allocations that require 32-bit absolute
 * virtual addresses (e.g. code that uses 32-bit pointers/offsets).
*
* heap_core is for allocations that require 2GB *relative*
* offsets; in other words all memory from heap_core is within
* 2GB of all other memory from the same arena. This is a requirement
* of the addressing modes of some processors in supervisor code.
*
* heap_arena is the general heap arena.
*
* static_arena is the static memory arena. Allocations from it
* are not subject to relocation so it is safe to use the memory
* physical address as well as the virtual address (e.g. the VA to
* PA translations are static). Caches may import from static_arena;
* all other static memory allocations should use static_alloc_arena.
*
* On some platforms which have limited virtual address space, seg_kmem
* may share [kernelheap, ekernelheap) with seg_kp; if this is so,
* segkp_bitmap is non-NULL, and each bit represents a page of virtual
* address space which is actually seg_kp mapped.
*/
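/*
 * Illustrative sketch (not part of the original source): given the hashing
 * scheme described above, the page backing a kernel virtual address can be
 * looked up in &kvp using the VA itself as the vnode offset. The helper
 * name below is hypothetical.
 */
#if 0	/* sketch only */
static page_t *
segkmem_va_to_page_sketch(caddr_t addr)
{
	/* seg_kmem pages are hashed into &kvp at offset (u_offset_t)va */
	return (page_find(&kvp, (u_offset_t)(uintptr_t)addr));
}
#endif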
char *kernelheap; /* start of primary kernel heap */
char *ekernelheap; /* end of primary kernel heap */
char *heap_core_base; /* start of core kernel heap arena */
char *heap_lp_base; /* start of kernel large page heap arena */
char *heap_lp_end; /* end of kernel large page heap arena */
int segkmem_reloc; /* enable/disable relocatable segkmem pages */
/*
* seg_kmem driver can map part of the kernel heap with large pages.
* Currently this functionality is implemented for sparc platforms only.
*
 * The large page size "segkmem_lpsize" for the kernel heap is selected in
 * platform-specific code and can be overridden via /etc/system. Setting
 * segkmem_lpsize to PAGESIZE disables the use of large pages for the kernel
 * heap. "segkmem_lpshift" is adjusted appropriately to match segkmem_lpsize.
*
* At boot time we carve from kernel heap arena a range of virtual addresses
* that will be used for large page mappings. This range [heap_lp_base,
* heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
* create "kmem_lp_arena" that caches memory already backed up by large
* pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
*/
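/*
 * Illustrative sketch (not the original code): the arena topology described
 * above, roughly as it might appear in segkmem_heap_lp_init(). heap_lp_arena
 * spans [heap_lp_base, heap_lp_end) and kmem_lp_arena imports large-page
 * backed segments from it via import/release callbacks (segkmem_alloc_lpi()
 * and segkmem_free_lpi() later in this file). The quantum values below are
 * placeholders, not the tuned defaults.
 */
#if 0	/* sketch only */
	heap_lp_arena = vmem_create("heap_lp", heap_lp_base,
	    heap_lp_end - heap_lp_base, PAGESIZE, NULL, NULL, NULL, 0,
	    VM_SLEEP);
	kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, PAGESIZE,
	    segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP);
#endif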
int segkmem_lpszc = 0;
static vmem_t *kmem_lp_arena;
static vmem_t *segkmem_ppa_arena;
static segkmem_lpcb_t segkmem_lpcb;
/*
* We use "segkmem_kmemlp_max" to limit the total amount of physical memory
 * consumed by the large page heap. By default this parameter is set to 1/8 of
 * physmem, but it can be adjusted through /etc/system either directly or
 * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
 * we allow for the large page heap.
*/
static uint_t segkmem_kmemlp_pcnt;
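/*
 * Sketch (hypothetical helper, not in the original source): one way the cap
 * could be derived, using the stated default of 1/8 of physmem when no
 * percentage has been set. "physmem" is the kernel's physical page count.
 */
#if 0	/* sketch only */
static size_t
segkmem_kmemlp_max_sketch(void)
{
	if (segkmem_kmemlp_pcnt != 0)
		return ((size_t)ptob(physmem) / 100 * segkmem_kmemlp_pcnt);
	return ((size_t)ptob(physmem) / 8);	/* default: 1/8 of physmem */
}
#endif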
/*
* Getting large pages for kernel heap could be problematic due to
 * physical memory fragmentation. That's why we allow preallocating
* "segkmem_kmemlp_min" bytes at boot time.
*/
static size_t segkmem_kmemlp_min;
/*
 * Throttling is used to avoid expensive attempts to allocate large pages
 * for the kernel heap when many successive attempts to do so fail.
*/
/*
* Freed pages accumulate on a garbage list until segkmem is ready,
 * at which point we call segkmem_gc() to free them all.
*/
typedef struct segkmem_gc_list {
	struct segkmem_gc_list	*gc_next;
	vmem_t			*gc_arena;
	size_t			gc_size;
} segkmem_gc_list_t;

static segkmem_gc_list_t *segkmem_gc_list;
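/*
 * Illustrative sketch (not the original code): how a free performed before
 * kvseg is ready can be deferred by threading the freed region itself onto
 * the garbage list; segkmem_gc() drains the list once segkmem is up. The
 * parameter names (vmp, inaddr, size) are assumed to be those of the free
 * path that uses this list.
 */
#if 0	/* sketch only */
	if (kvseg.s_base == NULL) {
		segkmem_gc_list_t *gc = inaddr;	/* reuse the freed memory */
		gc->gc_arena = vmp;
		gc->gc_size = size;
		gc->gc_next = segkmem_gc_list;
		segkmem_gc_list = gc;
		return;
	}
#endif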
/*
* Allocations from the hat_memload arena add VM_MEMLOAD to their
* vmflags so that segkmem_xalloc() can inform the hat layer that it needs
* to take steps to prevent infinite recursion. HAT allocations also
* must be non-relocatable to prevent recursive page faults.
*/
static void *
hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
{
	return (segkmem_alloc(vmp, size, flags | VM_MEMLOAD | VM_NORELOC));
}
/*
* Allocations from static_arena arena (or any other arena that uses
* segkmem_alloc_permanent()) require non-relocatable (permanently
* wired) memory pages, since these pages are referenced by physical
* as well as virtual address.
*/
void *
segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
{
	return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
}
/*
* Initialize kernel heap boundaries.
*/
void
kernelheap_init(
	void *heap_start,
void *heap_end,
char *first_avail,
void *core_start,
void *core_end)
{
size_t heap_lp_size = 0;
#ifdef __sparc
#endif /* __sparc */
#ifdef __sparc
/*
* Bias heap_lp start address by kmem64_sz to reduce collisions
* in 4M kernel TSB between kmem64 area and heap_lp
*/
#endif /* __sparc */
/*
* If this platform has a 'core' heap area, then the space for
* overflow module text should be carved out of the end of that
* heap. Otherwise, it gets carved out of the general purpose
* heap.
*/
if (core_size > 0) {
}
#ifndef __sparc
else {
}
#endif
if (core_size > 0) {
} else {
}
/*
 * Reserve space for the large page heap. If large pages for the kernel
 * heap are enabled, the large page heap arena will be created later in
 * the boot sequence in segkmem_heap_lp_init(). Otherwise the allocated
 * range will be returned to heap_arena.
*/
if (heap_lp_size) {
}
/*
* Remove the already-spoken-for memory range [kernelheap, first_avail).
*/
#ifdef __sparc
/*
 * The PROM claims the physical and virtual resources used by panicbuf
 * and intr_vec_table. So reserve space for panicbuf, intr_vec_table,
 * and the reserved interrupt vector data structures from the 32-bit heap.
*/
#else /* __sparc */
#endif /* __sparc */
/*
* Create a set of arenas for memory with static translations
* (e.g. VA -> PA translations cannot change). Since using
* kernel pages by physical address implies it isn't safe to
* walk across page boundaries, the static_arena quantum must
* be PAGESIZE. Any kmem caches that require static memory
* should source from static_arena, while direct allocations
* should only use static_alloc_arena.
*/
	static_arena = vmem_create("static", NULL, 0, PAGESIZE,
	    segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
	static_alloc_arena = vmem_create("static_alloc", NULL, 0,
	    sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
	    0, VM_SLEEP);
/*
* Create an arena for translation data (ptes, hmes, or hblks).
 * We need an arena for this because hat_memload() is essential
 * to vmem_populate() (see comments in common/os/vmem.c).
*
* Note: any kmem cache that allocates from hat_memload_arena
* must be created as a KMC_NOHASH cache (i.e. no external slab
* and bufctl structures to allocate) so that slab creation doesn't
* require anything more than a single vmem_alloc().
*/
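	/*
	 * Sketch (assumed, not the original statement): such an arena could
	 * be created with vmem_create(), importing from heap_arena through
	 * the hat_memload_alloc() wrapper above and marking itself as a
	 * vmem populator; the exact flags are illustrative.
	 */
#if 0	/* sketch only */
	hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
	    hat_memload_alloc, segkmem_free, heap_arena, 0,
	    VM_SLEEP | VMC_POPULATOR);
#endif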
}
void
{
panic("boot_mapin: page_resv failed");
if (pfnum == PFN_INVALID)
continue;
/*
 * Must break up any large pages that may have constituent
 * pages being utilized for BOP_ALLOC()s before calling
 * page_numtopp(). The locking code (i.e. page_reclaim())
 * can't handle them.
*/
panic("boot_alloc: pp is NULL or free");
/*
* If the cage is on but doesn't yet contain this page,
* mark it as non-relocatable.
*/
}
#if defined(__x86)
#else
#endif
}
}
/*
* Get pages from boot and hash them into the kernel's vp.
* Used after page structs have been allocated, but before segkmem is ready.
*/
void *
{
prom_panic("boot_alloc: attempt to allocate memory after "
"BOP_GONE");
#ifdef __sparc
panic("boot_alloc: bop_alloc_chunk failed");
#else
panic("boot_alloc: BOP_ALLOC failed");
#endif
return (addr);
}
static void
segkmem_badop()
{
panic("segkmem_badop");
}
#define SEGKMEM_BADOP(t) (t(*)())segkmem_badop
/*ARGSUSED*/
static faultcode_t
{
panic("segkmem_fault: bad args");
/*
* If it is one of segkp pages, call segkp_fault.
*/
return (FC_NOSUPPORT);
switch (type) {
case F_SOFTLOCK: /* lock down already-loaded translations */
/*
* Hmm, no page. Does a kernel mapping
* exist for it?
*/
while (--pg >= 0) {
if (pp)
}
return (FC_NOMAP);
}
}
}
return (0);
case F_SOFTUNLOCK:
while (npages--) {
if (pp)
}
return (0);
default:
return (FC_NOSUPPORT);
}
/*NOTREACHED*/
}
static int
{
panic("segkmem_setprot: bad args");
/*
* If it is one of segkp pages, call segkp.
*/
if (prot == 0)
else
return (0);
}
/*
* This is a dummy segkmem function overloaded to call segkp
* when segkp is under the heap.
*/
/* ARGSUSED */
static int
{
/*
* If it is one of segkp pages, call into segkp.
*/
return (0);
}
/*
* This is a dummy segkmem function overloaded to call segkp
* when segkp is under the heap.
*/
/* ARGSUSED */
static int
{
/*
* If it is one of segkp pages, call into segkp.
*/
return (0);
}
static void
{
}
}
static void
{
/*
* If we are about to start dumping the range of addresses we
* carved out of the kernel heap for the large page heap walk
* heap_lp_arena to find what segments are actually populated
*/
if (SEGKMEM_USE_LARGEPAGES &&
} else {
}
}
static void
{
/*
* The kernel's heap_arena (represented by kvseg) is a very large
* VA space, most of which is typically unused. To speed up dumping
* we use vmem_walk() to quickly find the pieces of heap_arena that
* are actually in use. We do the same for heap32_arena and
* heap_core.
*
* We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
* may ultimately need to allocate memory. Reentrant walks are
* necessarily imperfect snapshots. The kernel heap continues
* to change during a live crash dump, for example. For a normal
* crash dump, however, we know that there won't be any other threads
* messing with the heap. Therefore, at worst, we may fail to dump
* the pages that get allocated by the act of dumping; but we will
* always dump every page that was allocated when the walk began.
*
* The other segkmem segments are dense (fully populated), so there's
* no need to use this technique when dumping them.
*
* Note: when adding special dump handling for any new sparsely-
* populated segments, be sure to add similar handling to the ::kgrep
* code in mdb.
*/
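	/*
	 * Sketch (illustrative, not the original body): walking only the
	 * allocated spans of the sparse heap arena; segkmem_dump_range is
	 * assumed to be the per-span callback used by this segment driver.
	 */
#if 0	/* sketch only */
	if (seg == &kvseg) {
		vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
		    segkmem_dump_range, seg->s_as);
	}
#endif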
#ifndef __sparc
#endif
} else if (seg == &kvseg_core) {
/*
* We don't want to dump pages attached to kzioseg since they
* contain file data from ZFS. If this page's segment is
 * kzioseg, return instead of writing it to the dump device.
*/
return;
} else {
}
}
/*
* Returns a shadow list of pages in ppp. If there are holes
* in the range (e.g. some of the kernel mappings do not have
* underlying page_ts) returns ENOTSUP so that as_pagelock()
* will handle the range via as_fault(F_SOFTLOCK).
*/
/*ARGSUSED*/
static int
{
/*
* If it is one of segkp pages, call into segkp.
*/
if (type == L_PAGEUNLOCK) {
}
return (0);
}
return (ENOTSUP); /* take the slow path */
}
while (--pg >= 0)
return (ENOTSUP);
}
}
return (0);
}
/*
* This is a dummy segkmem function overloaded to call segkp
* when segkp is under the heap.
*/
/* ARGSUSED */
static int
{
/*
* If it is one of segkp pages, call into segkp.
*/
return (0);
}
/*ARGSUSED*/
static lgrp_mem_policy_info_t *
{
return (NULL);
}
/*ARGSUSED*/
static int
{
if (capability == S_CAPABILITY_NOMINFLT)
return (1);
return (0);
}
static struct seg_ops segkmem_ops = {
SEGKMEM_BADOP(int), /* dup */
SEGKMEM_BADOP(int), /* unmap */
SEGKMEM_BADOP(void), /* free */
SEGKMEM_BADOP(int), /* sync */
SEGKMEM_BADOP(int), /* lockop */
SEGKMEM_BADOP(int), /* getprot */
SEGKMEM_BADOP(int), /* gettype */
SEGKMEM_BADOP(int), /* getvp */
SEGKMEM_BADOP(int), /* advise */
SEGKMEM_BADOP(int), /* setpgsz */
segkmem_getpolicy, /* getpolicy */
segkmem_capable, /* capable */
};
int
{
return (0);
}
int
{
return (0);
}
/*ARGSUSED*/
page_t *
{
int pgflags;
pgflags |= PG_NORELOC;
if ((vmflag & VM_NOSLEEP) == 0)
if (vmflag & VM_PUSHPAGE)
pgflags |= PG_PUSHPAGE;
if (vmflag & VM_NORMALPRI) {
pgflags |= PG_NORMALPRI;
}
}
/*
* Allocate pages to back the virtual address range [addr, addr + size).
* If addr is NULL, allocate the virtual address space as well.
*/
void *
{
int allocflag;
return (NULL);
return (NULL);
}
return (NULL);
}
/*
* Under certain conditions, we need to let the HAT layer know
* that it cannot safely allocate memory. Allocations from
* the hat_memload vmem arena always need this, to prevent
* infinite recursion.
*
* In addition, the x86 hat cannot safely do memory
* allocations while in vmem_populate(), because there
* is no simple bound on its usage.
*/
	if (vmflag & VM_MEMLOAD)
		allocflag = HAT_NO_KALLOC;
#if defined(__x86)
	else if (vmem_is_populator())
		allocflag = HAT_NO_KALLOC;
#endif
	else
		allocflag = 0;
#if defined(__x86)
#else
if (vmflag & SEGKMEM_SHARELOCKED)
else
#endif
}
return (addr);
}
static void *
{
void *addr;
#ifndef __sparc
halt("Memory allocation between bop_alloc() and "
"kmem_alloc().\n");
#endif
/*
* There's not a lot of memory to go around during boot,
* so recycle it if we can.
*/
return (gcp);
}
}
panic("segkmem_alloc: boot_alloc failed");
return (addr);
}
}
void *
{
}
void *
{
}
/*
* Any changes to this routine must also be carried over to
* devmap_free_pages() in the seg_dev driver. This is because
* we currently don't have a special kernel segment for non-paged
* kernel memory that is exported by drivers to user space.
*/
static void
{
return;
}
#if defined(__x86)
panic("segkmem_free: page not found");
if (!page_tryupgrade(pp)) {
/*
* Some other thread has a sharelock. Wait for
* it to drop the lock so we can free this page.
*/
SE_EXCL);
}
#else
#endif
panic("segkmem_free: page not found");
/* Clear p_lckcnt so page_destroy() doesn't update availrmem */
if (func)
else
page_destroy(pp, 0);
}
}
void
{
}
void
{
}
void
{
}
void
segkmem_gc(void)
{
	while (segkmem_gc_list != NULL) {
		segkmem_gc_list_t *gc = segkmem_gc_list;
		segkmem_gc_list = gc->gc_next;
		segkmem_free(gc->gc_arena, gc, gc->gc_size);
	}
}
/*
* Legacy entry points from here to end of file.
*/
void
{
flags | HAT_LOAD_LOCK);
}
void
{
}
void *
{
}
void
{
}
/*
* segkmem_page_create_large() allocates a large page to be used for the kmem
 * caches. If kpr is enabled, we ask for a relocatable page unless requested
 * otherwise. If kpr is disabled, we have to ask for a non-reloc page.
*/
static page_t *
{
int pgflags;
pgflags |= PG_NORELOC;
if (!(vmflag & VM_NOSLEEP))
if (vmflag & VM_PUSHPAGE)
pgflags |= PG_PUSHPAGE;
if (vmflag & VM_NORMALPRI)
pgflags |= PG_NORMALPRI;
}
/*
* Allocate a large page to back the virtual address range
* [addr, addr + size). If addr is NULL, allocate the virtual address
* space as well.
*/
static void *
void *pcarg)
{
int i;
vmflag |= VM_NOSLEEP;
return (NULL);
}
/*
 * Allocate an array we need for hat_memload_array.
 * We use a separate arena to avoid recursion.
 * We will not need this array when hat_memload_array learns pp++.
*/
goto fail_array_alloc;
}
goto fail_vmem_alloc;
/* create all the pages */
goto fail_page_create;
}
	/* at this point we have all the resources to complete the request */
for (i = 0; i < nbpages; i++) {
}
/*
* Load the locked entry. It's OK to preload the entry into the
* TSB since we now support large mappings in the kernel TSB.
*/
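	/*
	 * Sketch (assumed, not the original statement): the whole shadow
	 * list is loaded and locked in a single call; the attribute and
	 * flag values here are illustrative.
	 */
#if 0	/* sketch only */
	hat_memload_array(kas.a_hat, (caddr_t)addr, size, ppa,
	    (PROT_ALL & ~PROT_USER) | HAT_NOSYNC, HAT_LOAD_LOCK);
#endif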
for (--i; i >= 0; --i) {
page_unlock(ppa[i]);
}
}
return (addr);
}
}
return (NULL);
}
static void
{
panic("segkmem_free_one_lp: page not found");
}
/* page_unresv() is done by the caller */
}
/*
 * This function is called to import new spans into vmem arenas like
 * kmem_default_arena and kmem_oversize_arena. It first tries to import
 * spans from the large page arena, kmem_lp_arena. In order to do this it
 * might have to "upgrade" the requested size to the kmem_lp_arena quantum.
 * If it cannot satisfy the upgraded request, it falls back to the regular
 * segkmem_alloc(), which satisfies the request by importing from the
 * "*vmp" arena.
*/
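/*
 * Illustrative sketch (not the original code): "upgrading" a request to the
 * kmem_lp_arena quantum before importing, so that every import is a whole
 * large-page-backed chunk. P2ROUNDUP comes from <sys/sysmacros.h>; the
 * local variable names mirror the ones used in the function below.
 */
#if 0	/* sketch only */
	size_t kmemlp_qnt = segkmem_kmemlp_quantum;
	size_t asize = P2ROUNDUP(size, kmemlp_qnt);
#endif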
/*ARGSUSED*/
void *
{
!(vmflag & SEGKMEM_SHARELOCKED)) {
int dowakeup = 0;
int doalloc = 1;
if (lpthrt != 0) {
/* try to update the throttle value */
if (lpthrt >= segkmem_lpthrottle_max) {
segkmem_lpthrottle_max / 4);
}
/*
 * When we get above the throttle start value, do an exponential
 * backoff on large page attempts and reaping.
*/
if (lpthrt > segkmem_lpthrottle_start &&
lpcb->allocs_throttled++;
lpthrt--;
kmem_reap();
}
}
if (!(vmflag & VM_NOSLEEP) &&
/*
 * We are low on free memory in kmem_lp_arena, so we let
 * only one thread allocate the heap_lp quantum-size chunk
 * that everybody is going to share.
*/
/* we are not the first one - wait */
kmemlp_qnt) {
doalloc = 0;
}
kmemlp_qnt) {
/*
* we are the first one, make sure we import
* a large page
*/
if (asize == kmemlp_qnt)
asize += kmemlp_qnt;
dowakeup = 1;
}
}
/*
* VM_ABORT flag prevents sleeps in vmem_xalloc when
* large pages are not available. In that case this allocation
* attempt will fail and we will retry allocation with small
* pages. We also do not want to panic if this allocation fails
* because we are going to retry.
*/
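	/*
	 * Sketch (assumed, not the original statement): an allocation from
	 * kmem_lp_arena with VM_ABORT set and VM_PANIC cleared, so a failure
	 * simply falls back to the small page path.
	 */
#if 0	/* sketch only */
	addr = vmem_alloc(kmem_lp_arena, asize,
	    (vmflag | VM_ABORT) & ~VM_PANIC);
#endif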
if (doalloc) {
if (dowakeup) {
}
}
*lpthrtp = 0;
return (addr);
}
if (vmflag & VM_NOSLEEP)
else
/* if large page throttling is not started yet do it */
if (segkmem_use_lpthrottle && lpthrt == 0) {
}
}
}
void
{
} else {
}
}
/*
 * segkmem_alloc_lpi() imports virtual memory from the large page heap arena
 * into the kmem_lp arena. In the process it maps the imported segment with
 * large pages.
*/
static void *
{
void *addr;
	/* do not allow the large page heap to grow beyond its limits */
lpcb->allocs_limited++;
return (NULL);
}
return (addr);
}
/*
 * segkmem_free_lpi() returns virtual memory back to the large page heap
 * arena from the kmem_lp arena. Before doing this it unmaps the segment and
 * frees the large pages used to map it.
*/
static void
{
int i;
for (i = 0; i < nlpages; i++) {
}
}
/*
 * This function is called at system boot time by kmem_init right after
 * /etc/system has been read. Based on hardware configuration and /etc/system
 * settings it decides whether the system is going to use large pages for the
 * kernel heap. The initialization necessary to actually start using large
 * pages happens later in the boot process, after segkmem_heap_lp_init() is
 * called.
*/
int
{
int use_large_pages = 0;
#ifdef __sparc
if (heap_lp_base == NULL) {
return (0);
}
/* get a platform dependent value of large page size for kernel heap */
if (segkmem_lpsize <= PAGESIZE) {
/*
 * put virtual space reserved for the large page kernel
 * heap back into the regular heap
*/
heap_lp_base = NULL;
heap_lp_end = NULL;
return (0);
}
/* set heap_lp quantum if necessary */
if (segkmem_heaplp_quantum == 0 ||
}
/* set kmem_lp quantum if necessary */
if (segkmem_kmemlp_quantum == 0 ||
}
/* set total amount of memory allowed for large page kernel heap */
if (segkmem_kmemlp_max == 0) {
segkmem_kmemlp_pcnt = 12;
}
	/* fix lp kmem preallocation request if necessary */
if (segkmem_kmemlp_min) {
}
use_large_pages = 1;
#endif
return (use_large_pages);
}
void
{
ASSERT(zio_mem_size != 0);
/*
* To reduce VA space fragmentation, we set up quantum caches for the
* smaller sizes; we chose 32k because that translates to 128k VA
* slabs, which matches nicely with the common 128k zio_data bufs.
*/
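	/*
	 * Sketch (assumed, not the original statement): an arena spanning
	 * [zio_mem_base, zio_mem_base + zio_mem_size) with a 32k quantum
	 * cache limit as described above; the arena name and arguments are
	 * illustrative.
	 */
#if 0	/* sketch only */
	zio_arena = vmem_create("zfs_file_data", zio_mem_base, zio_mem_size,
	    PAGESIZE, NULL, NULL, NULL, 32 * 1024, VM_SLEEP);
#endif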
}
#ifdef __sparc
static void *
{
void *addr;
if (ppaquantum <= PAGESIZE)
}
return (addr);
}
static void
{
if (ppaquantum <= PAGESIZE) {
} else {
}
}
void
{
void *addr;
if (segkmem_lpsize <= PAGESIZE) {
return;
}
/* create large page heap arena */
/* This arena caches memory already mapped by large pages */
/*
 * This arena is used for the array of page_t pointers necessary
 * to call hat_memload_array.
*/
VM_SLEEP);
	/* preallocate some memory for the lp kernel heap */
if (segkmem_kmemlp_min) {
segkmem_heaplp_quantum) == 0);
}
}
}
#endif