vm_pagelist.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley 4.3 BSD
* under license from the Regents of the University of California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* This file contains common functions to access and manage the page lists.
* Many of these routines originated from platform dependent modules
* (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
* a platform independent manner.
*
* vm/vm_dep.h provides for platform specific support.
*/
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>
/*
 * Declared here, defined outside this file.
 * NOTE(review): presumably the number of virtual address cache colors --
 * confirm against the platform code.
 */
extern uint_t vac_colors;
/*
 * Number of page colors treated as equivalent to the requested color in the
 * page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
int colorequiv;
/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */
int ptcpthreshold;
/*
 * Limit the page_get_contig_pages search based on the failure counts kept
 * in pgcpfailcnt[], which has one slot per page size code.
 * Slot 0 (the base page size) is otherwise unused and serves to enable or
 * disable limiting of the search.
 * Limiting is enabled by default (pgcplimitsearch is initialized to 1).
 */
int pgcpfailcnt[MMU_PAGE_SIZES];
int pgcplimitsearch = 1;
/*
 * Statistics structure updated through VM_STAT_ADD(); it only exists
 * when the file is built with VM_STATS defined.
 */
#ifdef VM_STATS
struct vmm_vmstats_str vmm_vmstats;
#endif /* VM_STATS */
/*
 * LPGCREATE is the compile-time default for pg_lpgcreate_nocage:
 * 0 on sparc, 1 on every other platform.
 */
#if defined(__sparc)
#define LPGCREATE 0
#else
/* enable page_get_contig_pages */
#define LPGCREATE 1
#endif
/*
 * NOTE(review): pg_contig_disable presumably turns off contiguous page
 * creation when non-zero; its consumers are outside this part of the file.
 */
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;
/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define PFNNULL 0
/*
 * Flags involved in promotion and demotion routines
 * (passed as the last argument of page_promote() and page_demote()).
 */
#define PC_FREE 0x1 /* put page on freelist */
#define PC_ALLOC 0x2 /* return page for allocation */
/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 * page_boot_demote() passes it this way.
 */
#define PC_NO_COLOR (-1)
/*
 * Page counters candidates info.
 * See the page_ctrs_cands comment below for more details.
 * One of these exists per (ctr_mutex index, region size, mnode) triple.
 * Fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
pgcnt_t pcc_pages_free;
int pcc_color_free_len;
pgcnt_t *pcc_color_free;
} pcc_info_t;
/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 *
 * Indexing is page_ctrs_cands[ctr_mutex index][region size][mnode].
 */
#pragma align 64(page_ctrs_cands)
static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r).
 *
 * The result is the sum of pcc_pages_free over all NPC_MUTEX slots.
 * val must be an assignable expression.  The macro expands to a brace
 * block that declares its own local "i", so a caller's variable named
 * "i" must not be passed as m, r or val.
 */
#define PGCTRS_CANDS_GETVALUE(m, r, val) { \
int i; \
val = 0; \
for (i = 0; i < NPC_MUTEX; i++) { \
val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \
} \
}
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c).
 *
 * The result is the sum of pcc_color_free[c] over all NPC_MUTEX slots.
 * The color is range-checked (ASSERT) against the pcc_color_free_len of
 * slot 0 only.  The same caveats about val and the local "i" apply as for
 * PGCTRS_CANDS_GETVALUE.
 */
#define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \
int i; \
val = 0; \
ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \
for (i = 0; i < NPC_MUTEX; i++) { \
val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
} \
}
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 *
 * The lock protecting a page is ctr_mutex[PP_CTR_LOCK_INDX(pp)][mnode].
 * The index is the page frame number shifted down by the largest page
 * size's shift and masked with (NPC_MUTEX - 1).
 * NOTE(review): the mask only yields a uniform index if NPC_MUTEX is a
 * power of two -- confirm against its definition.
 */
static kmutex_t *ctr_mutex[NPC_MUTEX];
#define PP_CTR_LOCK_INDX(pp) \
(((pp)->p_pagenum >> \
(PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
/*
 * Prototypes for functions defined in this file.
 * Only page_trylock_cons() is declared static; the others have external
 * linkage.
 */
void page_ctr_add(page_t *, int);
void page_ctr_add_internal(int, page_t *, int);
void page_ctr_sub(page_t *, int);
uint_t page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
/*
 * PNUM_SIZE(szc): size in bytes of a page of size code szc shifted right
 * by the base page shift, i.e. expressed in base pages.
 */
#define PNUM_SIZE(szc) \
(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
/*
 * PNUM_SHIFT(szc): difference between the shift of size code szc and the
 * base page shift.
 */
#define PNUM_SHIFT(szc) \
(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
/*
* The page_counters array below is used to keep track of free contiguous
* physical memory. A hw_page_map_t will be allocated per mnode per szc.
* This contains an array of counters, the size of the array, a shift value
* used to convert a pagenum into a counter array index or vice versa, as
* well as a cache of the last successful index to be promoted to a larger
* page size. As an optimization, we keep track of the last successful index
* to be promoted per page color for the given size region, and this is
* allocated dynamically based upon the number of colors for a given
* region size.
*
* Conceptually, the page counters are represented as:
*
* page_counters[region_size][mnode]
*
* region_size: size code of a candidate larger page made up
* of contiguous free smaller pages.
*
* page_counters[region_size][mnode].hpm_counters[index]:
* represents how many (region_size - 1) pages either
* exist or can be created within the given index range.
*
* Let's look at a sparc example:
* If we want to create a free 512k page, we look at region_size 2
* for the mnode we want. We calculate the index and look at a specific
* hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
* this location, it means that 8 64k pages either exist or can be created
* from 8K pages in order to make a single free 512k page at the given
* index. Note that when a region is full, it will contribute to the
* counts in the region above it. Thus we will not know what page
* size the free pages will be which can be promoted to this new free
* page unless we look at all regions below the current region.
*/
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array"
 *				below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
hpmctr_t *hpm_counters;
size_t hpm_entries;
int hpm_shift;
pfn_t hpm_base;
size_t hpm_color_current_len;
size_t *hpm_color_current;
} hw_page_map_t;
/*
 * page_counters[region size] points at an array of hw_page_map_t indexed
 * by mnode.
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 * Note the argument order is (mnode, rg_szc) although the underlying array
 * is indexed [rg_szc][mnode].
 */
#define PAGE_COUNTERS(mnode, rg_szc, idx) \
(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_counters)
#define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_shift)
#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_entries)
#define PAGE_COUNTERS_BASE(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_base)
#define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \
(page_counters[(rg_szc)][(mnode)].hpm_color_current)
#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \
(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
/*
 * PNUM_TO_IDX converts a page frame number into an index into the
 * hpm_counters array of (mnode, rg_szc): subtract hpm_base, then shift
 * right by hpm_shift.
 */
#define PNUM_TO_IDX(mnode, rg_szc, pnum) \
(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
/*
 * IDX_TO_PNUM is the inverse: it yields the first page frame number
 * covered by counter "index" (index shifted left by hpm_shift, plus
 * hpm_base).
 */
#define IDX_TO_PNUM(mnode, rg_szc, index) \
(PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.  There is one lock per memory node;
 * the array is indexed by mnode.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
/*
 * Translate a page size in bytes into its page size code.
 *
 * hw_page_array[] is scanned from index 0 until an entry with a zero
 * hp_size is found.  The index of the entry whose hp_size equals pagesize
 * is returned; -1 is returned when no entry matches.
 */
int
page_szc(size_t pagesize)
{
	int	szc;

	for (szc = 0; hw_page_array[szc].hp_size != 0; szc++) {
		if (hw_page_array[szc].hp_size == pagesize) {
			return (szc);
		}
	}
	return (-1);
}
/*
 * Translate a page size in bytes into the user-visible page size code.
 *
 * The size is first mapped to a kernel size code with page_szc(); a valid
 * code is then converted with SZC_2_USERSZC().  Returns -1 when the size
 * does not match any supported page size.
 */
int
page_user_szc(size_t pagesize)
{
	int	szc = page_szc(pagesize);

	if (szc == -1) {
		return (-1);
	}
	return (SZC_2_USERSZC(szc));
}
/*
 * Report how many page sizes are exported to user land.  This reflects
 * what the hardware supports, not how the OS implements support for the
 * different page sizes.  The value is read from mmu_exported_page_sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	uint_t	nsizes = mmu_exported_page_sizes;

	return (nsizes);
}
/*
 * Report the number of page sizes known to the kernel (mmu_page_sizes).
 */
uint_t
page_num_pagesizes(void)
{
	uint_t	nsizes = mmu_page_sizes;

	return (nsizes);
}
/*
 * Return the number of base pagesize pages that make up a page of size
 * code szc (hw_page_array[szc].hp_pgcnt).
 * Panics when szc is not below mmu_page_sizes.
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (!(szc < mmu_page_sizes)) {
		panic("page_get_pagecnt: out of range %d", szc);
	}

	return (hw_page_array[szc].hp_pgcnt);
}
/*
 * Return the size in bytes of a page of size code szc
 * (hw_page_array[szc].hp_size).
 * Panics when szc is not below mmu_page_sizes.
 */
size_t
page_get_pagesize(uint_t szc)
{
	if (!(szc < mmu_page_sizes)) {
		panic("page_get_pagesize: out of range %d", szc);
	}

	return (hw_page_array[szc].hp_size);
}
/*
 * Return the page size in bytes for a user-visible size index.  Index zero
 * is the smallest page size in the system and each increment selects the
 * next larger supported size.  The user index is translated with
 * USERSZC_2_SZC() because szc and userszc may differ on systems that do
 * not support every szc.
 * Panics when the translated size code is not below mmu_page_sizes.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t	szc;

	szc = USERSZC_2_SZC(userszc);
	if (!(szc < mmu_page_sizes)) {
		panic("page_get_user_pagesize: out of range %d", szc);
	}

	return (hw_page_array[szc].hp_size);
}
/*
 * Return the shift (hw_page_array[szc].hp_shift) of a page of size code
 * szc.
 * Panics when szc is not below mmu_page_sizes.
 */
uint_t
page_get_shift(uint_t szc)
{
	if (!(szc < mmu_page_sizes)) {
		panic("page_get_shift: out of range %d", szc);
	}

	return (hw_page_array[szc].hp_shift);
}
/*
 * Return the number of page colors for size code szc: page_colors shifted
 * right by PAGE_BSZS_SHIFT(szc), but never less than one.
 * page_colors must already be non-zero (asserted).
 */
uint_t
page_get_pagecolors(uint_t szc)
{
	uint_t	ncolors;

	ASSERT(page_colors != 0);

	ncolors = MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1);
	return (ncolors);
}
/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * Returns the number of bytes that page_ctrs_alloc() will carve out of
 * the buffer it is handed:
 *	- per existing mnode and region size: the hpm_counters array
 *	  (rounded up to pointer size) and the hpm_color_current array
 *	- per region size: the hw_page_map_t array, the page_ctrs_cands
 *	  pcc_info_t arrays and their per-color pcc_color_free arrays
 *	- the ctr_mutex arrays
 *	- whatever PLCNT_SZ() adds for the page list counts
 *	- slop for the per-mnode cache line roundups
 *
 * The running total is kept in a size_t, matching the return type, so
 * that it is not truncated to 32 bits on the way.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	i;
	size_t	ctrs_sz = 0;	/* running byte total */
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		pgcnt_t	r_pgcnt;
		pfn_t	r_base;
		pgcnt_t	r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 *
		 * NOTE(review): the entry count is derived from
		 * (physmax - r_base).  If physmax is an inclusive pfn the
		 * last page may need one more entry; any change must be
		 * made here and in page_ctrs_alloc() together.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += (colors_per_szc[r] *
			    sizeof (size_t));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));

		/* add in space for page_ctrs_cands */
		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
		    sizeof (pgcnt_t);
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
/*
 * Carve the page counter data structures out of the buffer starting at
 * alloc_base and initialize them.  The amount consumed is expected to be
 * covered by the value page_ctrs_sz() returned; the two functions must be
 * kept in step.
 *
 * Layout, in carve order:
 *	1. page_counters[r]: max_mem_nodes hw_page_map_t per region size
 *	2. page_ctrs_cands[i][r]: max_mem_nodes pcc_info_t per lock index
 *	   and region size
 *	3. the pcc_color_free arrays of every page_ctrs_cands element
 *	4. ctr_mutex[i]: max_mem_nodes kmutex_t per lock index
 *	5. the page list counts (PLCNT_INIT)
 *	6. per existing mnode and region size: hpm_color_current followed
 *	   by hpm_counters, with alloc_base rounded up to L2CACHE_ALIGN
 *	   after each mnode
 *
 * NOTE(review): nothing here zeroes the carved memory or initializes the
 * mutexes, so the buffer is presumably zero-filled by the caller --
 * confirm.
 *
 * Returns the first address past the consumed area.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
int mnode;
int r; /* region size */
int i;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
/*
 * We need to determine how many page colors there are for each
 * page size in order to allocate memory for any color specific
 * arrays.
 */
colors_per_szc[0] = page_colors;
for (i = 1; i < mmu_page_sizes; i++) {
colors_per_szc[i] =
page_convert_color(0, i, page_colors - 1) + 1;
}
/* page_counters: one hw_page_map_t per mnode for each region size */
for (r = 1; r < mmu_page_sizes; r++) {
page_counters[r] = (hw_page_map_t *)alloc_base;
alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
}
/* page_ctrs_cands */
for (r = 1; r < mmu_page_sizes; r++) {
for (i = 0; i < NPC_MUTEX; i++) {
page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
}
}
/* page_ctrs_cands pcc_color_free array */
for (r = 1; r < mmu_page_sizes; r++) {
for (i = 0; i < NPC_MUTEX; i++) {
for (mnode = 0; mnode < max_mem_nodes; mnode++) {
page_ctrs_cands[i][r][mnode].pcc_color_free_len
= colors_per_szc[r];
page_ctrs_cands[i][r][mnode].pcc_color_free =
(pgcnt_t *)alloc_base;
alloc_base += colors_per_szc[r] *
sizeof (pgcnt_t);
}
}
}
/* ctr_mutex */
for (i = 0; i < NPC_MUTEX; i++) {
ctr_mutex[i] = (kmutex_t *)alloc_base;
alloc_base += (max_mem_nodes * sizeof (kmutex_t));
}
/* initialize page list counts */
PLCNT_INIT(alloc_base);
for (mnode = 0; mnode < max_mem_nodes; mnode++) {
pgcnt_t r_pgcnt;
pfn_t r_base;
pgcnt_t r_align;
int r_shift;
if (mem_node_config[mnode].exists == 0)
continue;
for (r = 1; r < mmu_page_sizes; r++) {
/*
 * the page_counters base has to be aligned to the
 * page count of page size code r otherwise the counts
 * will cross large page boundaries.
 */
r_align = page_get_pagecnt(r);
r_base = mem_node_config[mnode].physbase;
/* base needs to be aligned - lower to aligned value */
r_base &= ~(r_align - 1);
/*
 * NOTE(review): same entry count formula as page_ctrs_sz();
 * if physmax is an inclusive pfn this may be one entry short.
 * Change both functions together or not at all.
 */
r_pgcnt = howmany(mem_node_config[mnode].physmax -
r_base, r_align);
r_shift = PAGE_BSZS_SHIFT(r);
PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
PAGE_COUNTERS_BASE(mnode, r) = r_base;
PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
colors_per_szc[r];
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
(size_t *)alloc_base;
alloc_base += (sizeof (size_t) * colors_per_szc[r]);
/* start each color's search position at index == color */
for (i = 0; i < colors_per_szc[r]; i++) {
PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
}
PAGE_COUNTERS_COUNTERS(mnode, r) =
(hpmctr_t *)alloc_base;
/*
 * Round up to make alloc_base always be aligned on
 * a pointer boundary.
 */
alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
sizeof (hpmctr_t *));
/*
 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
 * satisfy the identity requirement.
 * We should be able to go from one to the other
 * and get consistent values.
 */
ASSERT(PNUM_TO_IDX(mnode, r,
(IDX_TO_PNUM(mnode, r, 0))) == 0);
ASSERT(IDX_TO_PNUM(mnode, r,
(PNUM_TO_IDX(mnode, r, r_base))) == r_base);
}
/*
 * Roundup the start address of the page_counters to
 * cache aligned boundary for every memory node.
 * page_ctrs_sz() has added some slop for these roundups.
 */
alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
L2CACHE_ALIGN);
}
/*
 * Initialize other page counter specific data structures.
 * Note this walks all MAX_MEM_NODES locks, not just max_mem_nodes.
 */
for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
}
return (alloc_base);
}
/*
* Functions to adjust region counters for each size free list.
* Caller is responsible to acquire the ctr_mutex lock if necessary and
* thus can be called during startup without locks.
*/
/*
 * Account for pp (a page of size code pp->p_szc in mnode) being added to
 * a page list: bump the page list count, then the region counters.
 *
 * Starting with the region one size above the page, the counter covering
 * the page is incremented.  Whenever that makes the region full, the
 * matching page_ctrs_cands totals (overall and per color) are bumped and
 * the walk continues with the next larger region; otherwise it stops.
 * Pages of the largest size have no region counter.
 *
 * No lock is taken here; the caller serializes access if needed.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, page_t *pp, int flags)
{
	ssize_t		rgn;		/* region size */
	ssize_t		ctr_idx;
	pfn_t		pfn;
	int		mtx_idx;
	pcc_info_t	*cand;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	pfn = pp->p_pagenum;
	mtx_idx = PP_CTR_LOCK_INDX(pp);

	for (rgn = pp->p_szc + 1; rgn < mmu_page_sizes; rgn++) {
		ctr_idx = PNUM_TO_IDX(mnode, rgn, pfn);

		ASSERT(ctr_idx < PAGE_COUNTERS_ENTRIES(mnode, rgn));
		ASSERT(PAGE_COUNTERS(mnode, rgn, ctr_idx) <
		    FULL_REGION_CNT(rgn));

		PAGE_COUNTERS(mnode, rgn, ctr_idx)++;
		if (PAGE_COUNTERS(mnode, rgn, ctr_idx) !=
		    FULL_REGION_CNT(rgn)) {
			/* region not full yet: nothing propagates upward */
			break;
		}

		/* region just became full: it is now a coalesce candidate */
		cand = &page_ctrs_cands[mtx_idx][rgn][mnode];
		cand->pcc_pages_free++;
		cand->pcc_color_free[PP_2_BIN_SZC(pp, rgn)]++;
	}
}
/*
 * Locked wrapper around page_ctr_add_internal(): takes the ctr_mutex
 * selected by the page's lock index and memory node for the duration of
 * the counter update.
 */
void
page_ctr_add(page_t *pp, int flags)
{
	int		mtx_idx = PP_CTR_LOCK_INDX(pp);
	int		mnode = PP_2_MEM_NODE(pp);
	kmutex_t	*ctr_lock;

	ctr_lock = &ctr_mutex[mtx_idx][mnode];

	mutex_enter(ctr_lock);
	page_ctr_add_internal(mnode, pp, flags);
	mutex_exit(ctr_lock);
}
/*
 * Account for pp being removed from a page list: drop the page list
 * count, then the region counters.
 *
 * Under the ctr_mutex selected by the page's lock index and memory node,
 * the counter covering the page is decremented starting with the region
 * one size above the page.  If the region was full before the decrement,
 * the matching page_ctrs_cands totals (overall and per color) are dropped
 * and the walk continues with the next larger region; otherwise it stops.
 * Pages of the largest size have no region counter.
 */
void
page_ctr_sub(page_t *pp, int flags)
{
	int		mnode = PP_2_MEM_NODE(pp);
	int		mtx_idx;
	kmutex_t	*ctr_lock;
	ssize_t		rgn;		/* region size */
	ssize_t		ctr_idx;
	pfn_t		pfn;
	pcc_info_t	*cand;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	rgn = pp->p_szc + 1;
	pfn = pp->p_pagenum;
	mtx_idx = PP_CTR_LOCK_INDX(pp);
	ctr_lock = &ctr_mutex[mtx_idx][mnode];

	mutex_enter(ctr_lock);
	for (; rgn < mmu_page_sizes; rgn++) {
		ctr_idx = PNUM_TO_IDX(mnode, rgn, pfn);

		ASSERT(ctr_idx < PAGE_COUNTERS_ENTRIES(mnode, rgn));
		ASSERT(PAGE_COUNTERS(mnode, rgn, ctr_idx) > 0);

		PAGE_COUNTERS(mnode, rgn, ctr_idx)--;
		if (PAGE_COUNTERS(mnode, rgn, ctr_idx) !=
		    FULL_REGION_CNT(rgn) - 1) {
			/* region was not full: nothing propagates upward */
			break;
		}

		/* region was full and no longer is: retire the candidate */
		cand = &page_ctrs_cands[mtx_idx][rgn][mnode];
		ASSERT(cand->pcc_pages_free != 0);
		ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, rgn)] != 0);

		cand->pcc_pages_free--;
		cand->pcc_color_free[PP_2_BIN_SZC(pp, rgn)]--;
	}
	mutex_exit(ctr_lock);
}
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * The work is done in three phases:
 *	1. Preallocate, with KM_NOSLEEP, a new hpm_counters array and a new
 *	   hpm_color_current array for every region size.  If any
 *	   allocation fails, everything allocated so far is freed and
 *	   ENOMEM is returned.
 *	2. With page_ctrs_rwlock[mnode] held as writer and all freelist
 *	   mutexes of the mnode held, install the new arrays, copying the
 *	   counters for the pfn range the old and new arrays have in
 *	   common.
 *	3. After dropping the locks, free the old arrays that were found to
 *	   lie within [kernelheap, ekernelheap).
 *	   NOTE(review): old arrays outside that range are not freed,
 *	   presumably because they were carved out by page_ctrs_alloc() at
 *	   boot -- confirm.
 *
 * Returns 0 on success, ENOMEM if the preallocation failed.
 */
uint_t
page_ctrs_adjust(int mnode)
{
pgcnt_t npgs;
int r; /* region size */
int i;
size_t pcsz, old_csz;
hpmctr_t *new_ctr, *old_ctr;
pfn_t oldbase, newbase;
size_t old_npgs;
/*
 * The *_cache arrays serve two purposes: before the locks are taken they
 * hold the preallocated new arrays; afterwards they hold the old arrays
 * (and, in size_cache, the old entry counts) that are to be freed.
 */
hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
size_t size_cache[MMU_PAGE_SIZES];
size_t *color_cache[MMU_PAGE_SIZES];
size_t *old_color_array;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
/* new pfn range covered by the counters, aligned to PC_BASE_ALIGN */
newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
npgs = roundup(mem_node_config[mnode].physmax,
PC_BASE_ALIGN) - newbase;
/*
 * We need to determine how many page colors there are for each
 * page size in order to allocate memory for any color specific
 * arrays.
 */
colors_per_szc[0] = page_colors;
for (r = 1; r < mmu_page_sizes; r++) {
colors_per_szc[r] =
page_convert_color(0, r, page_colors - 1) + 1;
}
/*
 * Preallocate all of the new hpm_counters arrays as we can't
 * hold the page_ctrs_rwlock as a writer and allocate memory.
 * If we can't allocate all of the arrays, undo our work so far
 * and return failure.
 */
for (r = 1; r < mmu_page_sizes; r++) {
pcsz = npgs >> PAGE_BSZS_SHIFT(r);
ctr_cache[r] = kmem_zalloc(pcsz *
sizeof (hpmctr_t), KM_NOSLEEP);
if (ctr_cache[r] == NULL) {
/* free the arrays obtained for the smaller region sizes */
while (--r >= 1) {
kmem_free(ctr_cache[r],
size_cache[r] * sizeof (hpmctr_t));
}
return (ENOMEM);
}
size_cache[r] = pcsz;
}
/*
 * Preallocate all of the new color current arrays as we can't
 * hold the page_ctrs_rwlock as a writer and allocate memory.
 * If we can't allocate all of the arrays, undo our work so far
 * and return failure.
 */
for (r = 1; r < mmu_page_sizes; r++) {
color_cache[r] = kmem_zalloc(sizeof (size_t) *
colors_per_szc[r], KM_NOSLEEP);
if (color_cache[r] == NULL) {
/* free the color arrays obtained so far ... */
while (--r >= 1) {
kmem_free(color_cache[r],
colors_per_szc[r] * sizeof (size_t));
}
/* ... and every counter array from the first pass */
for (r = 1; r < mmu_page_sizes; r++) {
kmem_free(ctr_cache[r],
size_cache[r] * sizeof (hpmctr_t));
}
return (ENOMEM);
}
}
/*
 * Grab the write lock to prevent others from walking these arrays
 * while we are modifying them.
 */
rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
page_freelist_lock(mnode);
for (r = 1; r < mmu_page_sizes; r++) {
PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
/* remember the old state before it is overwritten below */
old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
oldbase = PAGE_COUNTERS_BASE(mnode, r);
old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
new_ctr = ctr_cache[r];
ctr_cache[r] = NULL;
/* only copy when an old array exists and the pfn ranges overlap */
if (old_ctr != NULL &&
(oldbase + old_npgs > newbase) &&
(newbase + npgs > oldbase)) {
/*
 * Map the intersection of the old and new
 * counters into the new array.
 */
size_t offset;
if (newbase > oldbase) {
/* new range starts inside the old one: skip old entries */
offset = (newbase - oldbase) >>
PAGE_COUNTERS_SHIFT(mnode, r);
bcopy(old_ctr + offset, new_ctr,
MIN(pcsz, (old_csz - offset)) *
sizeof (hpmctr_t));
} else {
/* old range starts inside the new one: skip new entries */
offset = (oldbase - newbase) >>
PAGE_COUNTERS_SHIFT(mnode, r);
bcopy(old_ctr, new_ctr + offset,
MIN(pcsz - offset, old_csz) *
sizeof (hpmctr_t));
}
}
PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
PAGE_COUNTERS_BASE(mnode, r) = newbase;
PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
color_cache[r] = NULL;
/*
 * for now, just reset on these events as it's probably
 * not worthwhile to try and optimize this.
 */
for (i = 0; i < colors_per_szc[r]; i++) {
PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
}
/* cache info for freeing out of the critical path */
if ((caddr_t)old_ctr >= kernelheap &&
(caddr_t)old_ctr < ekernelheap) {
ctr_cache[r] = old_ctr;
size_cache[r] = old_csz;
}
if ((caddr_t)old_color_array >= kernelheap &&
(caddr_t)old_color_array < ekernelheap) {
color_cache[r] = old_color_array;
}
/*
 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
 * satisfy the identity requirement.
 * We should be able to go from one to the other
 * and get consistent values.
 */
ASSERT(PNUM_TO_IDX(mnode, r,
(IDX_TO_PNUM(mnode, r, 0))) == 0);
ASSERT(IDX_TO_PNUM(mnode, r,
(PNUM_TO_IDX(mnode, r, newbase))) == newbase);
}
page_freelist_unlock(mnode);
rw_exit(&page_ctrs_rwlock[mnode]);
/*
 * Now that we have dropped the write lock, it is safe to free all
 * of the memory we have cached above.
 *
 * NOTE(review): the old color array is freed using the current
 * colors_per_szc[r], not the old hpm_color_current_len; this assumes the
 * color count has not changed since the old array was allocated.
 */
for (r = 1; r < mmu_page_sizes; r++) {
if (ctr_cache[r] != NULL) {
kmem_free(ctr_cache[r],
size_cache[r] * sizeof (hpmctr_t));
}
if (color_cache[r] != NULL) {
kmem_free(color_cache[r],
colors_per_szc[r] * sizeof (size_t));
}
}
return (0);
}
/*
 * Convert a color (bin) that is valid for page size code cur_szc into the
 * corresponding color for new_szc.
 *
 * The color is scaled by the difference between the two page shifts:
 * shifted left when converting to a smaller page size, shifted right when
 * converting to a larger one, and returned unchanged when the size codes
 * are equal.
 */
uint_t
page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
{
	uint_t	delta;

	if (cur_szc == new_szc) {
		return (color);
	}

	if (cur_szc > new_szc) {
		delta = page_get_shift(cur_szc) - page_get_shift(new_szc);
		return (color << delta);
	}

	delta = page_get_shift(new_szc) - page_get_shift(cur_szc);
	return (color >> delta);
}
#ifdef DEBUG
/*
 * DEBUG-only consistency check that pp is a free large page of size
 * code szc.
 *
 * The constituent page count is taken from pp->p_szc.  A count of one
 * must be a self-linked szc 0 page.  Otherwise pp must be aligned to the
 * page count, and every constituent page must be physically and
 * structurally contiguous, free, aged, without a vnode, of size code szc,
 * and agree with the first page on its NORELOC state.
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t	remaining = page_get_pagecnt(pp->p_szc);
	uint_t		noreloc;

	if (remaining == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	/* head page: alignment and links to its neighbours */
	ASSERT(IS_P2ALIGNED(pp->p_pagenum, remaining));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (remaining - 1)));
	ASSERT(pp->p_prev == (pp + (remaining - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	for (; remaining != 0; pp = pp->p_next) {
		remaining--;
		if (remaining != 0) {
			/* every page but the last is followed by pfn + 1 */
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);
	}
}
#endif /* DEBUG */
/*
 * Acquire every freelist and cachelist bin mutex of mnode.  For each of
 * the NPC_MUTEX indexes, in ascending order, the FPC mutex is taken
 * first and the CPC mutex second.
 */
void
page_freelist_lock(int mnode)
{
	int	idx = 0;

	while (idx < NPC_MUTEX) {
		mutex_enter(FPC_MUTEX(mnode, idx));
		mutex_enter(CPC_MUTEX(mnode, idx));
		idx++;
	}
}
/*
 * Release every freelist and cachelist bin mutex of mnode that
 * page_freelist_lock() acquired.  The mutexes are dropped index by index
 * in ascending order, FPC mutex first and CPC mutex second.
 */
void
page_freelist_unlock(int mnode)
{
	int	idx = 0;

	while (idx < NPC_MUTEX) {
		mutex_exit(FPC_MUTEX(mnode, idx));
		mutex_exit(CPC_MUTEX(mnode, idx));
		idx++;
	}
}
/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 *
 * pp must be a free, unmapped, unshared base (szc 0) page that the caller
 * holds exclusively locked, except during startup (PG_LIST_ISINIT).
 *
 * flags:
 *	PG_LIST_ISINIT	startup path: insert on the szc 0 freelist and
 *			update the counters without taking any lock
 *	PG_FREE_LIST	insert on the freelist (page must be aged);
 *			otherwise the page goes on the cachelist and must
 *			have a vnode and a page aligned offset
 *	PG_LIST_TAIL	make pp the tail instead of the head
 *
 * On sparc, a NORELOC page also bumps the cage free memory count.
 * The page is left locked; unlocking is up to the caller.
 */
void
page_list_add(page_t *pp, int flags)
{
page_t **ppp;
kmutex_t *pcm;
uint_t bin, mtype;
int mnode;
ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
ASSERT(PP_ISFREE(pp));
ASSERT(!hat_page_is_mapped(pp));
ASSERT(hat_page_getshare(pp) == 0);
/*
 * Large pages should be freed via page_list_add_pages().
 */
ASSERT(pp->p_szc == 0);
/*
 * Don't need to lock the freelist first here
 * because the page isn't on the freelist yet.
 * This means p_szc can't change on us.
 */
bin = PP_2_BIN(pp);
mnode = PP_2_MEM_NODE(pp);
mtype = PP_2_MTYPE(pp);
if (flags & PG_LIST_ISINIT) {
/*
 * PG_LIST_ISINIT is set during system startup (ie. single
 * threaded), add a page to the free list and add to the
 * the free region counters w/o any locking
 */
ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
/*
 * inline version of page_add()
 *
 * On a non-empty list pp is linked in just before the current head
 * and the head pointer is left unchanged.  On an empty list only
 * the head pointer is set.
 * NOTE(review): the empty-list case does not touch pp->p_next or
 * pp->p_prev, so it relies on them already pointing at pp -- confirm
 * against the startup caller.
 */
if (*ppp != NULL) {
pp->p_next = *ppp;
pp->p_prev = (*ppp)->p_prev;
(*ppp)->p_prev = pp;
pp->p_prev->p_next = pp;
} else
*ppp = pp;
page_ctr_add_internal(mnode, pp, flags);
} else {
pcm = PC_BIN_MUTEX(mnode, bin, flags);
if (flags & PG_FREE_LIST) {
ASSERT(PP_ISAGED(pp));
ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
} else {
ASSERT(pp->p_vnode);
ASSERT((pp->p_offset & PAGEOFFSET) == 0);
ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
}
mutex_enter(pcm);
page_add(ppp, pp);
/*
 * Advancing the head by one turns the page just added into the
 * tail (presumably page_add() made it the head -- its definition
 * is outside this file).
 */
if (flags & PG_LIST_TAIL)
*ppp = (*ppp)->p_next;
/*
 * Add counters before releasing pcm mutex to avoid a race with
 * page_freelist_coalesce and page_freelist_fill.
 */
page_ctr_add(pp, flags);
mutex_exit(pcm);
}
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
kcage_freemem_add(1);
}
#endif
/*
 * It is up to the caller to unlock the page!
 */
ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}
#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating the region counters.
 *
 * The page is unlinked from the MTYPE_RELOC free or cache list it is on
 * (after demoting it to base pages if it is part of a large page), marked
 * NORELOC, linked onto the corresponding MTYPE_NORELOC list, and
 * kcage_freemem is incremented by one.  The page list counts are kept in
 * step through PLCNT_DECR/PLCNT_INCR.
 */
void
page_list_noreloc_startup(page_t *pp)
{
page_t **ppp;
uint_t bin;
int mnode;
int mtype;
int flags = PG_LIST_ISCAGE;
/*
 * If this is a large page on the freelist then
 * break it up into smaller pages.
 */
if (pp->p_szc != 0)
page_boot_demote(pp);
/*
 * Get list page is currently on.
 */
bin = PP_2_BIN(pp);
mnode = PP_2_MEM_NODE(pp);
mtype = PP_2_MTYPE(pp);
ASSERT(mtype == MTYPE_RELOC);
ASSERT(pp->p_szc == 0);
/* aged pages live on the freelist, all others on the cachelist */
if (PP_ISAGED(pp)) {
ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
flags |= PG_FREE_LIST;
} else {
ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
flags |= PG_CACHE_LIST;
}
ASSERT(*ppp != NULL);
/*
 * Delete page from current list.
 * If pp is the head, advance the head first; if the head still equals
 * pp afterwards, pp was the only element and the list becomes empty.
 */
if (*ppp == pp)
*ppp = pp->p_next; /* go to next page */
if (*ppp == pp) {
*ppp = NULL; /* page list is gone */
} else {
pp->p_prev->p_next = pp->p_next;
pp->p_next->p_prev = pp->p_prev;
}
/* LINTED */
PLCNT_DECR(pp, mnode, 0, flags);
/*
 * Set no reloc for cage initted pages.
 */
PP_SETNORELOC(pp);
/* the memory type must be recomputed now that the page is NORELOC */
mtype = PP_2_MTYPE(pp);
ASSERT(mtype == MTYPE_NORELOC);
/*
 * Get new list for page.
 */
if (PP_ISAGED(pp)) {
ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
} else {
ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
}
/*
 * Insert page on new list.
 * On a non-empty list the page is linked in just before the head and
 * the head pointer is left unchanged.
 */
if (*ppp == NULL) {
*ppp = pp;
pp->p_next = pp->p_prev = pp;
} else {
pp->p_next = *ppp;
pp->p_prev = (*ppp)->p_prev;
(*ppp)->p_prev = pp;
pp->p_prev->p_next = pp;
}
/* LINTED */
PLCNT_INCR(pp, mnode, 0, flags);
/*
 * Update cage freemem counter
 */
atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */
/*
 * Non-sparc stub: this routine must never be reached on other platforms
 * and panics if it is.
 */
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif
/*
 * Add the large page headed by pp to the head of the freelist for its
 * size code.  PG_CACHE_LIST and PG_LIST_TAIL are not accepted.
 *
 * With PG_LIST_ISINIT (startup) the page must be of the largest size and
 * not NORELOC; it is linked in and counted without taking any lock, and
 * its constituent pages are not unlocked.
 *
 * Otherwise the page is linked in and the region counters are updated
 * under the freelist bin mutex; on sparc a NORELOC page is credited to
 * the cage free memory count; finally every constituent page is
 * unlocked.
 */
void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t	*bin_lock;
	pgcnt_t		npages;
	uint_t		bin, mtype, done;
	int		mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/* startup: no locking, page list count only */
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, pp->p_szc, flags);
		return;
	}

	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	bin_lock = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

	mutex_enter(bin_lock);
	page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	page_ctr_add(pp, PG_FREE_LIST);
	mutex_exit(bin_lock);

	npages = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
	if (PP_ISNORELOC(pp))
		kcage_freemem_add(npages);
#endif
	/* release the lock on each constituent page */
	done = 0;
	while (done < npages) {
		page_unlock(pp);
		done++;
		pp++;
	}
}
/*
 * During boot, demote the free large page containing pp into base
 * pagesize pages so seg_kmem can use them in boot_alloc().
 *
 * pp must belong to a free, aged large page.  The whole large page is
 * demoted to size code 0 and its pieces are put back on the freelist
 * (PC_FREE) without regard to color (PC_NO_COLOR).
 */
void
page_boot_demote(page_t *pp)
{
	int	mnode;
	pfn_t	lpg_base;

	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	mnode = PP_2_MEM_NODE(pp);
	lpg_base = PFN_BASE(pp->p_pagenum, pp->p_szc);

	(void) page_demote(mnode, lpg_base, pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}
/*
* Take a particular page off of whatever freelist the page
* is claimed to be on.
*
* 'flags' selects the list: with PG_FREE_LIST set the page is removed from
* the free list, otherwise from the cache list. The caller must hold 'pp'
* exclusively locked and the page must be free (both are asserted).
*
* NOTE: Intended for PAGESIZE pages. If 'pp' turns out to belong to a large
* page on the free list, that large page is demoted to PAGESIZE pages first
* and then 'pp' alone is removed. A large page on the cache list panics.
*/
void
page_list_sub(page_t *pp, int flags)
{
int bin;
uint_t mtype;
int mnode;
kmutex_t *pcm;
page_t **ppp;
ASSERT(PAGE_EXCL(pp));
ASSERT(PP_ISFREE(pp));
/*
* The p_szc field can only be changed by page_promote()
* and page_demote(). Only free pages can be promoted and
* demoted and the free list MUST be locked during these
* operations. So to prevent a race in page_list_sub()
* between computing which bin of the freelist lock to
* grab and actually grabbing the lock we check again that
* the bin we locked is still the correct one. Notice that
* the p_szc field could have actually changed on us but
* if the bin happens to still be the same we are safe.
*/
try_again:
bin = PP_2_BIN(pp);
mnode = PP_2_MEM_NODE(pp);
pcm = PC_BIN_MUTEX(mnode, bin, flags);
mutex_enter(pcm);
if (PP_2_BIN(pp) != bin) {
/* The bin changed while we were acquiring the mutex; start over. */
mutex_exit(pcm);
goto try_again;
}
mtype = PP_2_MTYPE(pp);
/* Pick the list head that matches the list named by 'flags'. */
if (flags & PG_FREE_LIST) {
ASSERT(PP_ISAGED(pp));
ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
} else {
ASSERT(!PP_ISAGED(pp));
ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
}
/*
* Common PAGESIZE case.
*
* Note that we locked the freelist. This prevents
* any page promotion/demotion operations. Therefore
* the p_szc will not change until we drop pcm mutex.
*/
if (pp->p_szc == 0) {
page_sub(ppp, pp);
/*
* Subtract counters before releasing pcm mutex
* to avoid race with page_freelist_coalesce.
*/
page_ctr_sub(pp, flags);
mutex_exit(pcm);
#if defined(__sparc)
/* A NORELOC page leaving the list reduces the cage free count. */
if (PP_ISNORELOC(pp)) {
kcage_freemem_sub(1);
}
#endif
return;
}
/*
* Large pages on the cache list are not supported.
*/
if (flags & PG_CACHE_LIST)
panic("page_list_sub: large page on cachelist");
/*
* Slow but rare.
*
* Somebody wants this particular page which is part
* of a large page. In this case we just demote the page
* if it's on the freelist.
*
* We have to drop pcm before locking the entire freelist.
* Once we have re-locked the freelist check to make sure
* the page hasn't already been demoted or completely
* freed.
*/
mutex_exit(pcm);
page_freelist_lock(mnode);
if (pp->p_szc != 0) {
/*
* Large page is on freelist.
*/
(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
}
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
ASSERT(pp->p_szc == 0);
/*
* Subtract counters before releasing the freelist locks
* to avoid race with page_freelist_coalesce.
*
* The bin and mtype are recomputed here because pp is now a
* PAGESIZE page.
*/
bin = PP_2_BIN(pp);
mtype = PP_2_MTYPE(pp);
ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
page_sub(ppp, pp);
page_ctr_sub(pp, flags);
page_freelist_unlock(mnode);
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
kcage_freemem_sub(1);
}
#endif
}
/*
* Remove the free page headed by 'pp' from the free list as a page whose
* size code is no larger than 'szc'.
*
* The caller must hold 'pp' exclusively locked and the page must be free
* and aged (all asserted). If 'pp' is part of a page larger than 'szc',
* that page is first demoted to 'szc' under the full freelist lock. On
* return pp->p_szc may still be smaller than 'szc'; callers need to check.
*/
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
kmutex_t *pcm;
uint_t bin, mtype;
int mnode;
ASSERT(PAGE_EXCL(pp));
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
/*
* See comment in page_list_sub().
*/
try_again:
bin = PP_2_BIN(pp);
mnode = PP_2_MEM_NODE(pp);
pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
mutex_enter(pcm);
if (PP_2_BIN(pp) != bin) {
mutex_exit(pcm);
goto try_again;
}
VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]);
/*
* If we're called with a page larger than szc or it got
* promoted above szc before we locked the freelist then
* drop pcm and re-lock entire freelist. If page still larger
* than szc then demote it.
*/
if (pp->p_szc > szc) {
VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]);
mutex_exit(pcm);
/* pcm == NULL records that the whole freelist is locked instead. */
pcm = NULL;
page_freelist_lock(mnode);
if (pp->p_szc > szc) {
VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]);
(void) page_demote(mnode,
PFN_BASE(pp->p_pagenum, pp->p_szc),
pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
}
/* The size code may have changed, so recompute the bin. */
bin = PP_2_BIN(pp);
}
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
ASSERT(pp->p_szc <= szc);
ASSERT(pp == PP_PAGEROOT(pp));
mtype = PP_2_MTYPE(pp);
/* Large pages are unlinked with page_vpsub, PAGESIZE pages with page_sub. */
if (pp->p_szc != 0) {
page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
CHK_LPG(pp, pp->p_szc);
} else {
page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
}
page_ctr_sub(pp, PG_FREE_LIST);
/* Release whichever lock is held: the single bin mutex or the whole freelist. */
if (pcm != NULL) {
mutex_exit(pcm);
} else {
page_freelist_unlock(mnode);
}
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
pgcnt_t pgcnt;
pgcnt = page_get_pagecnt(pp->p_szc);
kcage_freemem_sub(pgcnt);
}
#endif
}
/*
 * Insert 'pp' at the head of the circular, doubly linked page list whose
 * head pointer is '*ppp'.  The list is threaded through the p_next and
 * p_prev fields.  On return '*ppp' points at 'pp'.
 *
 * No locking is done here; the caller must protect the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	page_t *head = *ppp;

	if (head != NULL) {
		/* Splice pp in between the current tail and the head. */
		pp->p_next = head;
		pp->p_prev = head->p_prev;
		head->p_prev = pp;
		pp->p_prev->p_next = pp;
	} else {
		/* Empty list: pp becomes a list of one. */
		pp->p_next = pp->p_prev = pp;
	}

	*ppp = pp;
}
/*
 * Remove 'pp' from the circular, doubly linked page list whose head
 * pointer is '*ppp'.  The list is threaded through the p_next and p_prev
 * fields.
 *
 * If 'pp' was the head, the head advances to the next page; if 'pp' was
 * the only page, '*ppp' becomes NULL.  On return 'pp' is a list of one.
 * Panics if the list is empty or 'pp' is NULL.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	/*
	 * Validate the arguments before anything dereferences pp, so that a
	 * NULL pp reaches this panic instead of faulting inside the ASSERT.
	 */
	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	ASSERT(PP_ISFREE(pp));

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp) {
		/* pp pointed at itself, so it was the only page. */
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}
/*
 * Routine fsflush uses to gradually coalesce the free list into larger
 * pages.
 *
 * Looks at the region of size code cur_szc + 1 that contains 'pp'.  If,
 * with the freelist locked, that region's page counter shows it is full,
 * the region is promoted and left on the free list (PC_FREE).
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	int	next_szc = cur_szc + 1;
	int	full_cnt = FULL_REGION_CNT(next_szc);
	pfn_t	pfn = page_pptonum(pp);
	int	mnode = PFN_2_MEM_NODE(pfn);
	int	ctr_idx;

	page_freelist_lock(mnode);

	/* The counter is examined only while the freelist is locked. */
	ctr_idx = PNUM_TO_IDX(mnode, next_szc, pfn);
	if (PAGE_COUNTERS(mnode, next_szc, ctr_idx) == full_cnt) {
		(void) page_promote(mnode, pfn, next_szc, PC_FREE);
	}

	page_freelist_unlock(mnode);
}
/*
* Failure counters for page_promote().
* page_promote_err is bumped on every failed promotion.
* page_promote_noreloc_err is bumped in addition when the failure was
* caused by constituent pages disagreeing on PP_ISNORELOC().
*/
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
/*
* Create a single larger page (of szc new_szc) from smaller contiguous pages
* for the given mnode starting at pfnum. Pages involved are on the freelist
* before the call and may be returned to the caller if requested, otherwise
* they will be placed back on the freelist.
* If flags is PC_ALLOC, then the large page will be returned to the user in
* a state which is consistent with a page being taken off the freelist. If
* we failed to lock the new large page, then we will return NULL to the
* caller and put the large page on the freelist instead.
* If flags is PC_FREE, then the large page will be placed on the freelist,
* and NULL will be returned.
* NULL is also returned, with the pages put back on the PAGESIZE freelist
* or left untouched, when the promotion has to be abandoned.
* The caller is responsible for locking the freelist as well as any other
* accounting which needs to be done for a returned page.
*
* RFE: For performance pass in pp instead of pfnum so
* we can avoid excessive calls to page_numtopp_nolock().
* This would depend on an assumption that all contiguous
* pages are in the same memseg so we can just add/dec
* our pp.
*
* Lock ordering:
*
* There is a potential but rare deadlock situation
* for page promotion and demotion operations. The problem
* is there are two paths into the freelist manager and
* they have different lock orders:
*
* page_create()
* lock freelist
* page_lock(EXCL)
* unlock freelist
* return
* caller drops page_lock
*
* page_free() and page_reclaim()
* caller grabs page_lock(EXCL)
*
* lock freelist
* unlock freelist
* drop page_lock
*
* What prevents a thread in page_create() from deadlocking
* with a thread freeing or reclaiming the same page is the
* page_trylock() in page_get_freelist(). If the trylock fails
* it skips the page.
*
* The lock ordering for promotion and demotion is the same as
* for page_create(). Since the same deadlock could occur during
* page promotion and freeing or reclaiming of a page on the
* cache list we might have to fail the operation and undo what
* we have done so far. Again this is rare.
*/
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
page_t *pp, *pplist, *tpp, *start_pp;
pgcnt_t new_npgs, npgs;
uint_t bin;
pgcnt_t tmpnpgs, pages_left;
uint_t mtype;
uint_t noreloc;
uint_t i;
int which_list;
ulong_t index;
kmutex_t *phm;
/*
* General algorithm:
* Find the starting page
* Walk each page struct removing it from the freelist,
* and linking it to all the other pages removed.
* Once all pages are off the freelist,
* walk the list, modifying p_szc to new_szc and what
* ever other info needs to be done to create a large free page.
* According to the flags, either return the page or put it
* on the freelist.
*/
start_pp = page_numtopp_nolock(pfnum);
ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
new_npgs = page_get_pagecnt(new_szc);
ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
/*
* Loop through smaller pages to confirm that all pages
* give the same result for PP_ISNORELOC().
* We can check this reliably here as the protocol for setting
* P_NORELOC requires pages to be taken off the free list first.
*/
for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
if (pp == start_pp) {
/* First page, set requirement. */
noreloc = PP_ISNORELOC(pp);
} else if (noreloc != PP_ISNORELOC(pp)) {
/* Mixed NORELOC state: give up before touching any list. */
page_promote_noreloc_err++;
page_promote_err++;
return (NULL);
}
}
pages_left = new_npgs;
pplist = NULL;
pp = start_pp;
/* Loop around coalescing the smaller pages into a big page. */
while (pages_left) {
/*
* Remove from the freelist.
*/
ASSERT(PP_ISFREE(pp));
bin = PP_2_BIN(pp);
ASSERT(mnode == PP_2_MEM_NODE(pp));
mtype = PP_2_MTYPE(pp);
if (PP_ISAGED(pp)) {
/*
* PG_FREE_LIST
*/
if (pp->p_szc) {
page_vpsub(&PAGE_FREELISTS(mnode,
pp->p_szc, bin, mtype), pp);
} else {
mach_page_sub(&PAGE_FREELISTS(mnode, 0,
bin, mtype), pp);
}
which_list = PG_FREE_LIST;
} else {
ASSERT(pp->p_szc == 0);
/*
* PG_CACHE_LIST
*
* Since this page comes from the
* cachelist, we must destroy the
* vnode association.
*/
if (!page_trylock(pp, SE_EXCL)) {
goto fail_promote;
}
/*
* We need to be careful not to deadlock
* with another thread in page_lookup().
* The page_lookup() thread could be holding
* the same phm that we need if the two
* pages happen to hash to the same phm lock.
* At this point we have locked the entire
* freelist and page_lookup() could be trying
* to grab a freelist lock.
*/
index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
phm = PAGE_HASH_MUTEX(index);
if (!mutex_tryenter(phm)) {
page_unlock(pp);
goto fail_promote;
}
mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
page_hashout(pp, phm);
mutex_exit(phm);
/* With the vnode association gone, mark the page aged. */
PP_SETAGED(pp);
page_unlock(pp);
which_list = PG_CACHE_LIST;
}
page_ctr_sub(pp, which_list);
/*
* Concatenate the smaller page(s) onto
* the large page list.
*/
tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
pages_left -= npgs;
tpp = pp;
/* Stamp every constituent of this piece with the new size code. */
while (npgs--) {
tpp->p_szc = new_szc;
tpp = tpp->p_next;
}
page_list_concat(&pplist, &pp);
/* Step to the page_t that follows the piece just consumed. */
pp += tmpnpgs;
}
CHK_LPG(pplist, new_szc);
/*
* return the page to the user if requested
* in the properly locked state.
*/
if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
return (pplist);
}
/*
* Otherwise place the new large page on the freelist
*/
bin = PP_2_BIN(pplist);
mnode = PP_2_MEM_NODE(pplist);
mtype = PP_2_MTYPE(pplist);
page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
page_ctr_add(pplist, PG_FREE_LIST);
return (NULL);
fail_promote:
/*
* A thread must have still been freeing or
* reclaiming the page on the cachelist.
* To prevent a deadlock undo what we have
* done so far and return failure. This
* situation can only happen while promoting
* PAGESIZE pages.
*
* Every page collected so far goes back on the PAGESIZE
* free list, including pages that originally came from the
* cache list (they have already been hashed out and aged).
*/
page_promote_err++;
while (pplist) {
pp = pplist;
mach_page_sub(&pplist, pp);
pp->p_szc = 0;
bin = PP_2_BIN(pp);
mtype = PP_2_MTYPE(pp);
mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
page_ctr_add(pp, PG_FREE_LIST);
}
return (NULL);
}
/*
* Break up a large page into smaller size pages.
* Pages involved are on the freelist before the call and may
* be returned to the caller if requested, otherwise they will
* be placed back on the freelist.
* The caller is responsible for locking the freelist as well as any other
* accounting which needs to be done for a returned page.
* If flags is not PC_ALLOC, the color argument is ignored, and thus
* technically, any value may be passed in but PC_NO_COLOR is the standard
* which should be followed for clarity's sake.
*
* mnode, pfnum and cur_szc identify the large page: pfnum is its first pfn
* and cur_szc its current size code. new_szc is the size code of the
* resulting pieces and must be smaller than cur_szc.
*
* Returns the first resulting piece whose bin equals 'color' and which
* could be locked with page_trylock_cons() when flags is PC_ALLOC;
* otherwise NULL. All other pieces go onto the new_szc freelist.
*/
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
int color, int flags)
{
page_t *pp, *pplist, *npplist;
pgcnt_t npgs, n;
uint_t bin;
uint_t mtype;
page_t *ret_pp = NULL;
ASSERT(cur_szc != 0);
ASSERT(new_szc < cur_szc);
pplist = page_numtopp_nolock(pfnum);
ASSERT(pplist != NULL);
ASSERT(pplist->p_szc == cur_szc);
bin = PP_2_BIN(pplist);
ASSERT(mnode == PP_2_MEM_NODE(pplist));
mtype = PP_2_MTYPE(pplist);
/* Take the whole large page off its freelist before carving it up. */
page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
CHK_LPG(pplist, cur_szc);
page_ctr_sub(pplist, PG_FREE_LIST);
/*
* Number of PAGESIZE pages for smaller new_szc
* page.
*/
npgs = page_get_pagecnt(new_szc);
while (pplist) {
pp = pplist;
ASSERT(pp->p_szc == cur_szc);
/*
* We either break it up into PAGESIZE pages or larger.
*/
if (npgs == 1) { /* PAGESIZE case */
mach_page_sub(&pplist, pp);
ASSERT(pp->p_szc == cur_szc);
ASSERT(new_szc == 0);
ASSERT(mnode == PP_2_MEM_NODE(pp));
pp->p_szc = new_szc;
bin = PP_2_BIN(pp);
/* Keep at most one page of the requested color for the caller. */
if ((bin == color) && (flags == PC_ALLOC) &&
(ret_pp == NULL) &&
page_trylock_cons(pp, SE_EXCL)) {
ret_pp = pp;
} else {
mtype = PP_2_MTYPE(pp);
mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
mtype), pp);
page_ctr_add(pp, PG_FREE_LIST);
}
} else {
/*
* Break down into smaller lists of pages.
*/
page_list_break(&pplist, &npplist, npgs);
pp = pplist;
n = npgs;
while (n--) {
ASSERT(pp->p_szc == cur_szc);
pp->p_szc = new_szc;
pp = pp->p_next;
}
/*
* NOTE(review): pp is used below in place of pplist. This
* presumably relies on page_list_break() leaving pplist as
* a circular list of npgs pages, so that the walk above
* ends back at pplist -- confirm.
*/
CHK_LPG(pplist, new_szc);
bin = PP_2_BIN(pplist);
ASSERT(mnode == PP_2_MEM_NODE(pp));
if ((bin == color) && (flags == PC_ALLOC) &&
(ret_pp == NULL) &&
page_trylock_cons(pp, SE_EXCL)) {
ret_pp = pp;
} else {
mtype = PP_2_MTYPE(pp);
page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
bin, mtype), pplist);
page_ctr_add(pplist, PG_FREE_LIST);
}
/* Continue with whatever is left of the original large page. */
pplist = npplist;
}
}
return (ret_pp);
}
/*
* When non-zero, page_freelist_coalesce() and page_freelist_coalesce_all()
* return immediately without coalescing anything.
*/
int mpss_coalesce_disable = 0;
/*
* Coalesce free pages into a page of the given szc and color if possible.
* Return the pointer to the page created, otherwise, return NULL.
*
* The search walks the page counters for region size 'szc' in steps of the
* color stride, starting just after the index remembered from the previous
* search for this color, and wraps around to 'color' at the end.
*/
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
int r; /* region size */
int idx, full, i;
pfn_t pfnum;
size_t len;
size_t buckets_to_check;
pgcnt_t cands;
page_t *ret_pp;
int color_stride;
VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);
if (mpss_coalesce_disable) {
return (NULL);
}
r = szc;
/* Bail out early when the candidate count says nothing can be promoted. */
PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
if (cands == 0) {
VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
return (NULL);
}
full = FULL_REGION_CNT(r);
color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
page_colors;
/* Prevent page_counters dynamic memory from being freed */
rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
len = PAGE_COUNTERS_ENTRIES(mnode, r);
buckets_to_check = len / color_stride;
/* Resume one stride past where the last search for this color stopped. */
idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
ASSERT((idx % color_stride) == color);
idx += color_stride;
if (idx >= len)
idx = color;
for (i = 0; i < buckets_to_check; i++) {
if (PAGE_COUNTERS(mnode, r, idx) == full) {
pfnum = IDX_TO_PNUM(mnode, r, idx);
ASSERT(pfnum >= mem_node_config[mnode].physbase &&
pfnum < mem_node_config[mnode].physmax);
/*
* RFE: For performance maybe we can do something less
* brutal than locking the entire freelist. So far
* this doesn't seem to be a performance problem?
*/
page_freelist_lock(mnode);
/* Re-check under the lock; the counter was first read unlocked. */
if (PAGE_COUNTERS(mnode, r, idx) != full) {
VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
goto skip_this_one;
}
ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
if (ret_pp != NULL) {
/* Remember where we succeeded so the next search starts after it. */
PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
idx;
page_freelist_unlock(mnode);
rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
if (PP_ISNORELOC(ret_pp)) {
pgcnt_t npgs;
npgs = page_get_pagecnt(ret_pp->p_szc);
kcage_freemem_sub(npgs);
}
#endif
return (ret_pp);
}
skip_this_one:
page_freelist_unlock(mnode);
/*
* No point looking for another page if we've
* already tried all of the ones that
* page_ctr_cands indicated. Stash off where we left
* off.
* Note: this is not exact since we don't hold the
* page_freelist_locks before we initially get the
* value of cands for performance reasons, but should
* be a decent approximation.
*/
if (--cands == 0) {
PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
idx;
break;
}
}
/* Advance to the next counter of the same color, wrapping at the end. */
idx += color_stride;
if (idx >= len)
idx = color;
}
rw_exit(&page_ctrs_rwlock[mnode]);
VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
return (NULL);
}
/*
 * For the given mnode, promote as many small pages to large pages as
 * possible.  Does nothing when mpss_coalesce_disable is set.
 *
 * The page counters lock is held as reader and the entire freelist is
 * locked for the duration.  Region sizes are visited from the largest
 * down to 1 so that the biggest possible pages are built first, which
 * reduces the number of page promotions.
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	pfn_t	pfnum;
	size_t	len;
	pgcnt_t	cands;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	page_freelist_lock(mnode);

	r = mmu_page_sizes - 1;
	while (r > 0) {
		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
		if (cands != 0) {
			full = FULL_REGION_CNT(r);
			len = PAGE_COUNTERS_ENTRIES(mnode, r);

			/* Promote every region whose counter shows it is full. */
			for (idx = 0; idx < len; idx++) {
				if (PAGE_COUNTERS(mnode, r, idx) != full) {
					continue;
				}
				pfnum = IDX_TO_PNUM(mnode, r, idx);
				ASSERT(pfnum >=
				    mem_node_config[mnode].physbase &&
				    pfnum <
				    mem_node_config[mnode].physmax);
				(void) page_promote(mnode, pfnum, r, PC_FREE);
			}
		} else {
			/* No candidates at this region size. */
			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
		}
		r--;
	}

	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);
}
/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 *
 * Returns a page of size code 'szc' and color 'color' obtained by demoting
 * a larger free page or by coalescing smaller free pages, or NULL if
 * neither worked.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is not PFNNULL, only large pages whose starting pfn is below
 * pfnhi are considered for demotion.  The limit is not applied to the
 * coalescing step.
 */
page_t *
page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
{
	uchar_t	nszc = szc + 1;
	int	bin;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;

	ASSERT(szc < mmu_page_sizes);

	/*
	 * First try to break up a larger page to fill
	 * current size freelist.
	 */
	while (nszc < mmu_page_sizes) {
		/*
		 * If page found then demote it.
		 */
		bin = page_convert_color(szc, nszc, color);
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 *
			 * The list head was first examined without the
			 * freelist locked, so it may have become empty by the
			 * time it is re-read here; pp must be checked before
			 * it is dereferenced.
			 */
			if (pp != NULL && pfnhi != PFNNULL &&
			    pp->p_pagenum >= pfnhi) {
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						/* wrapped: nothing below pfnhi */
						pp = NULL;
						break;
					}
				} while (pp->p_pagenum >= pfnhi);
			}
			if (pp) {
				ASSERT(pp->p_szc == nszc);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pp->p_szc, szc, color, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}
		nszc++;
	}

	/*
	 * Ok that didn't work. Time to coalesce.
	 */
	if (szc != 0) {
		ret_pp = page_freelist_coalesce(mnode, szc, color);
	}

	return (ret_pp);
}
/*
 * Helper routine used only by the freelist code to lock a page with
 * mode 'se'.  For a large page it locks all of the constituent pages,
 * which are linked through p_next, or none at all.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t *cur, *undo;

	/* The first (or only) page must be lockable. */
	if (!page_trylock(pp, se)) {
		return (0);
	}

	/* PAGESIZE page: nothing more to lock. */
	if (pp->p_szc == 0) {
		return (1);
	}

	/* Large page: lock the remaining constituents in list order. */
	for (cur = pp->p_next; cur != pp; cur = cur->p_next) {
		if (page_trylock(cur, se)) {
			continue;
		}

		/* Back out: unlock everything locked before 'cur'. */
		for (undo = pp; undo != cur; undo = undo->p_next) {
			page_unlock(undo);
		}
		return (0);
	}

	return (1);
}
/*
* Try to get a free page of size code 'szc' from memory node 'mnode',
* beginning the search at color bin 'bin' and memory type 'mtype'.
*
* Each candidate bin is searched under its own bin mutex for a page that
* page_trylock_cons() can lock exclusively. If the bins of the requested
* color are empty, page_freelist_fill() is asked to demote or coalesce
* pages. Unless PG_MATCH_COLOR is set in 'flags', other color bins are
* then tried as well.
*
* Returns the page, removed from the free list, or NULL on failure.
*/
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
uint_t flags)
{
kmutex_t *pcm;
int i, fill_tried, fill_marker;
page_t *pp, *first_pp;
uint_t bin_marker;
int colors, cpucolors;
uchar_t nszc;
uint_t nszc_color_shift;
int nwaybins = 0, nwaycnt;
ASSERT(szc < mmu_page_sizes);
VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
/* LINTED */
MTYPE_START(mnode, mtype, flags);
if (mtype < 0) { /* mnode does not have memory in mtype range */
VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
return (NULL);
}
/*
* Set how many physical colors for this page size.
*/
colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
page_colors;
nszc = MIN(szc + 1, mmu_page_sizes - 1);
nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
cpucolors = cpu_page_colors;
/*
* adjust cpucolors to possibly check additional 'equivalent' bins
* to try to minimize fragmentation of large pages by delaying calls
* to page_freelist_fill.
*/
if (colorequiv > 1) {
int equivcolors = colors / colorequiv;
if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
cpucolors = equivcolors;
}
ASSERT(colors <= page_colors);
ASSERT(colors);
ASSERT((colors & (colors - 1)) == 0);
ASSERT(bin < colors);
/*
* Only hold one freelist lock at a time, that way we
* can start anywhere and not have to worry about lock
* ordering.
*/
big_try_again:
fill_tried = 0;
nwaycnt = 0;
for (i = 0; i <= colors; i++) {
try_again:
ASSERT(bin < colors);
/* Unlocked peek first; the list head is re-read under pcm below. */
if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
mutex_enter(pcm);
pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
if (pp != NULL) {
/*
* These were set before the page
* was put on the free list,
* they must still be set.
*/
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
ASSERT(pp->p_vnode == NULL);
ASSERT(pp->p_hash == NULL);
ASSERT(pp->p_offset == (u_offset_t)-1);
ASSERT(pp->p_szc == szc);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
/*
* Walk down the hash chain.
* 8k pages are linked on p_next
* and p_prev fields. Large pages
* are a contiguous group of
* constituent pages linked together
* on their p_next and p_prev fields.
* The large pages are linked together
* on the hash chain using p_vpnext
* p_vpprev of the base constituent
* page of each large page.
*/
first_pp = pp;
while (!page_trylock_cons(pp, SE_EXCL)) {
if (szc == 0) {
pp = pp->p_next;
} else {
pp = pp->p_vpnext;
}
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
ASSERT(pp->p_vnode == NULL);
ASSERT(pp->p_hash == NULL);
ASSERT(pp->p_offset == (u_offset_t)-1);
ASSERT(pp->p_szc == szc);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
mnode);
/* Back at the start: no page in this bin could be locked. */
if (pp == first_pp) {
pp = NULL;
break;
}
}
if (pp) {
ASSERT(mtype == PP_2_MTYPE(pp));
ASSERT(pp->p_szc == szc);
if (szc == 0) {
page_sub(&PAGE_FREELISTS(mnode,
szc, bin, mtype), pp);
} else {
page_vpsub(&PAGE_FREELISTS(
mnode, szc, bin, mtype),
pp);
CHK_LPG(pp, szc);
}
page_ctr_sub(pp, PG_FREE_LIST);
if ((PP_ISFREE(pp) == 0) ||
(PP_ISAGED(pp) == 0))
panic("free page is not. pp %p",
(void *)pp);
mutex_exit(pcm);
#if defined(__sparc)
ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
(flags & PG_NORELOC) == 0);
if (PP_ISNORELOC(pp)) {
pgcnt_t npgs;
npgs = page_get_pagecnt(szc);
kcage_freemem_sub(npgs);
}
#endif
VM_STAT_ADD(vmm_vmstats.
pgmf_allocok[szc]);
return (pp);
}
}
mutex_exit(pcm);
}
/*
* Wow! The initial bin is empty.
* If specific color is needed, check if page color may be
* in other bins. cpucolors is:
* 0 if the colors for this cpu is equal to page_colors.
* This means that pages with a particular color are in a
* single bin.
* -1 if colors of cpus (cheetah+) are heterogenous. Need to
* first determine the colors for the current cpu.
* >0 colors of all cpus are homogenous and < page_colors
*/
if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
if (!nwaybins) {
/*
* cpucolors is negative if ecache setsizes
* are heterogenous. determine colors for this
* particular cpu.
*/
if (cpucolors < 0) {
cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
ASSERT(cpucolors > 0);
nwaybins = colors / cpucolors;
} else {
nwaybins = colors / cpucolors;
ASSERT(szc > 0 || nwaybins > 1);
}
if (nwaybins < 2)
cpucolors = 0;
}
/* Step to the next bin that holds the same cpu color. */
if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
nwaycnt++;
bin = (bin + (colors / nwaybins)) &
(colors - 1);
if (nwaycnt < nwaybins) {
goto try_again;
}
}
/* back to initial color if fall-thru */
}
/*
* color bins are all empty if color match. Try and satisfy
* the request by breaking up or coalescing pages from
* a different size freelist of the correct color that
* satisfies the ORIGINAL color requested. If that
* fails then try pages of the same size but different
* colors assuming we are not called with
* PG_MATCH_COLOR.
*/
if (!fill_tried) {
fill_tried = 1;
/* Remember which next-size color bin this fill attempt covered. */
fill_marker = bin >> nszc_color_shift;
pp = page_freelist_fill(szc, bin, mnode, mtype,
PFNNULL);
if (pp != NULL) {
return (pp);
}
}
if (flags & PG_MATCH_COLOR)
break;
/*
* Select next color bin to try.
*/
if (szc == 0) {
/*
* PAGESIZE page case.
*/
if (i == 0) {
bin = (bin + BIN_STEP) & page_colors_mask;
bin_marker = bin;
} else {
bin = (bin + vac_colors) & page_colors_mask;
if (bin == bin_marker) {
bin = (bin + 1) & page_colors_mask;
bin_marker = bin;
}
}
} else {
/*
* Large page case.
*/
bin = (bin + 1) & (colors - 1);
}
/*
* If bin advanced to the next color bin of the
* next larger pagesize, there is a chance the fill
* could succeed.
*/
if (fill_marker != (bin >> nszc_color_shift))
fill_tried = 0;
}
#if defined(__sparc)
if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
(kcage_freemem >= kcage_lotsfree)) {
/*
* The Cage is ON and with plenty of free mem, and
* we're willing to check for a NORELOC page if we
* couldn't find a RELOC page, so spin again.
*/
flags |= PG_NORELOC;
mtype = MTYPE_NORELOC;
goto big_try_again;
}
#else
if (flags & PGI_MT_RANGE) {
/* cycle through range of mtypes */
MTYPE_NEXT(mnode, mtype, flags);
if (mtype >= 0)
goto big_try_again;
}
#endif
VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
return (NULL);
}
/*
 * Returns the count of free pages within the 'szc'-sized region that
 * starts at 'pp', as derived from the page counters of 'mnode'.
 *
 * Note: the result is approximate.  The page freelist locks are not held,
 * so the values in the page_counters may be changing as we walk through
 * the data.  Only page_ctrs_rwlock is held (as reader), which keeps the
 * page_counters memory from being freed.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	pgcnt_t	pgfree;
	pgcnt_t	cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	/* Start with the counter for the top-level region itself. */
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* A completely full region needs no further accounting. */
	if (cnt != range) {
		for (r--; r > 0; r--) {
			idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
			full = FULL_REGION_CNT(r);
			for (i = 0; i < range; i++, idx++) {
				cnt = PAGE_COUNTERS(mnode, r, idx);
				/*
				 * A full sub-region has already been
				 * accounted for at the level above.
				 */
				if (cnt == full) {
					continue;
				}
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
			range *= full;
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}
/*
* Called from page_geti_contig_pages to exclusively lock constituent pages
* starting from 'spp' for page size code 'szc'.
*
* If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
* region needs to be greater than or equal to the threshold. The check is
* skipped when PGI_PGCPHIPRI is set in 'flags'.
*
* Returns 1 with all constituent pages locked SE_EXCL, or 0 with none of
* them locked.
*/
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
pgcnt_t pgcnt = PNUM_SIZE(szc);
pgcnt_t pgfree, i;
page_t *pp;
VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
goto skipptcpcheck;
/*
* check if there are sufficient free pages available before attempting
* to trylock. Count is approximate as page counters can change.
*/
pgfree = page_freecnt(mnode, spp, szc);
/* attempt to trylock if there are sufficient already free pages */
if (pgfree < pgcnt/ptcpthreshold) {
VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
return (0);
}
skipptcpcheck:
for (i = 0; i < pgcnt; i++) {
pp = &spp[i];
if (!page_trylock(pp, SE_EXCL)) {
VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
/*
* Unlock pages i-1 down to 0. i is unsigned, so the
* loop ends when the decrement wraps to (pgcnt_t)-1.
*/
while (--i != (pgcnt_t)-1) {
pp = &spp[i];
ASSERT(PAGE_EXCL(pp));
page_unlock(pp);
}
return (0);
}
ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
/*
* Reject an in-use page that belongs to a page larger than szc,
* or of exactly szc when szc is non-zero. This is only expected
* for the first page (asserted), so only that page is unlocked.
*/
if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
!PP_ISFREE(pp)) {
VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
ASSERT(i == 0);
page_unlock(pp);
return (0);
}
if (PP_ISNORELOC(pp)) {
VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
/* Unlock pages i down to 0, including the current page. */
while (i != (pgcnt_t)-1) {
pp = &spp[i];
ASSERT(PAGE_EXCL(pp));
page_unlock(pp);
i--;
}
return (0);
}
}
VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
return (1);
}
/*
* Claim large page pointed to by 'pp'. 'pp' is the starting set
* of 'szc' constituent pages that had been locked exclusively previously.
* Will attempt to relocate constituent pages in use.
*
* Returns the claimed large page, or NULL on failure. On failure the
* pages not yet processed are unlocked, and the pages already collected
* are put back on the free list as PAGESIZE pages and unlocked.
*/
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
spgcnt_t pgcnt, npgs, i;
page_t *targpp, *rpp, *hpp;
page_t *replpp = NULL;
page_t *pplist = NULL;
ASSERT(pp != NULL);
pgcnt = page_get_pagecnt(szc);
/* pgcnt counts the constituent pages that still have to be claimed. */
while (pgcnt) {
ASSERT(PAGE_EXCL(pp));
ASSERT(!PP_ISNORELOC(pp));
if (PP_ISFREE(pp)) {
/*
* If this is a PG_FREE_LIST page then its
* size code can change underneath us due to
* page promotion or demotion. As an optimization
* use page_list_sub_pages() instead of
* page_list_sub().
*/
if (PP_ISAGED(pp)) {
page_list_sub_pages(pp, szc);
/* A free page of exactly szc was found; use it as is. */
if (pp->p_szc == szc) {
return (pp);
}
ASSERT(pp->p_szc < szc);
npgs = page_get_pagecnt(pp->p_szc);
hpp = pp;
for (i = 0; i < npgs; i++, pp++) {
pp->p_szc = szc;
}
page_list_concat(&pplist, &hpp);
pgcnt -= npgs;
continue;
}
/* Free but not aged: a PAGESIZE page on the cache list. */
ASSERT(!PP_ISAGED(pp));
ASSERT(pp->p_szc == 0);
page_list_sub(pp, PG_CACHE_LIST);
page_hashout(pp, NULL);
PP_SETAGED(pp);
pp->p_szc = szc;
page_list_concat(&pplist, &pp);
pp++;
pgcnt--;
continue;
}
/* The page is in use: try to relocate its contents elsewhere. */
npgs = page_get_pagecnt(pp->p_szc);
/*
* page_create_wait freemem accounting done by caller of
* page_get_freelist and not necessary to call it prior to
* calling page_get_replacement_page.
*
* page_get_replacement_page can call page_get_contig_pages
* to acquire a large page (szc > 0); the replacement must be
* smaller than the contig page size to avoid looping or
* szc == 0 and PGI_PGCPSZC0 is set.
*/
if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
replpp = page_get_replacement_page(pp, NULL, 0);
if (replpp) {
npgs = page_get_pagecnt(pp->p_szc);
ASSERT(npgs <= pgcnt);
targpp = pp;
}
}
/*
* If replacement is NULL or do_page_relocate fails, fail
* coalescing of pages.
*/
if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
&npgs, NULL) != 0)) {
/*
* Unlock un-processed target list
*/
while (pgcnt--) {
ASSERT(PAGE_EXCL(pp));
page_unlock(pp);
pp++;
}
/*
* Free the processed target list.
*/
while (pplist) {
pp = pplist;
page_sub(&pplist, pp);
ASSERT(PAGE_EXCL(pp));
ASSERT(pp->p_szc == szc);
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
pp->p_szc = 0;
page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
page_unlock(pp);
}
if (replpp != NULL)
page_free_replacement_page(replpp);
return (NULL);
}
ASSERT(pp == targpp);
/* LINTED */
ASSERT(hpp = pp); /* That's right, it's an assignment */
pp += npgs;
pgcnt -= npgs;
/*
* Mark each relocated target page free with the new size code,
* and unlock the replacement page that took over from it.
*/
while (npgs--) {
ASSERT(PAGE_EXCL(targpp));
ASSERT(!PP_ISFREE(targpp));
ASSERT(!PP_ISNORELOC(targpp));
PP_SETFREE(targpp);
ASSERT(PP_ISAGED(targpp));
ASSERT(targpp->p_szc < szc || (szc == 0 &&
(flags & PGI_PGCPSZC0)));
targpp->p_szc = szc;
targpp = targpp->p_next;
rpp = replpp;
ASSERT(rpp != NULL);
page_sub(&replpp, rpp);
ASSERT(PAGE_EXCL(rpp));
ASSERT(!PP_ISFREE(rpp));
page_unlock(rpp);
}
ASSERT(targpp == hpp);
ASSERT(replpp == NULL);
page_list_concat(&pplist, &targpp);
}
CHK_LPG(pplist, szc);
return (pplist);
}
/*
 * Trim the kernel cage out of the part of pfnlo-pfnhi covered by 'mseg'
 * and store the result in *lo and *hi.
 *
 * Returns 1 when *lo and *hi have been set.  Returns 0 when nothing is
 * left after the trim; *lo and *hi are then left untouched.
 */
int
trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
{
	pfn_t	kcagepfn;
	int	decr;
	int	first_caged = (PP_ISNORELOC(mseg->pages) != 0);
	int	last_caged = (PP_ISNORELOC(mseg->epages - 1) != 0);

	if (first_caged && last_caged) {
		/* entire mseg in the cage */
		return (0);
	}

	if (!first_caged && !last_caged) {
		/* entire mseg outside of kernel cage */
		*lo = MAX(pfnlo, mseg->pages_base);
		*hi = MIN(pfnhi, (mseg->pages_end - 1));
		return (1);
	}

	/* Exactly one end of this mseg is inside the kernel cage. */
	decr = kcage_current_pfn(&kcagepfn);

	/* kernel cage may have transitioned past mseg */
	if (kcagepfn < mseg->pages_base || kcagepfn >= mseg->pages_end) {
		return (0);
	}

	if (first_caged) {
		/* lower part of this mseg inside kernel cage */
		ASSERT(decr == 0);
		*lo = kcagepfn;
		*hi = MIN(pfnhi, (mseg->pages_end - 1));
	} else {
		/* upper part of this mseg inside kernel cage */
		ASSERT(decr);
		*hi = kcagepfn;
		*lo = MAX(pfnlo, mseg->pages_base);
	}

	return (1);
}
/*
 * Called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim"
 * a page with size code 'szc'.  Claiming such a page requires acquiring
 * exclusive locks on all constituent pages (page_trylock_contig_pages),
 * relocating pages in use and concatenating these constituent pages into a
 * large page (page_claim_contig_pages).
 *
 * The page lists do not have such a large page and page_freelist_fill has
 * already failed to demote larger pages and/or coalesce smaller free pages.
 *
 * mnode	memory node passed through to page_trylock_contig_pages.
 * bin		requested color, expressed in 'szc' colors.
 * szc		size code of the page wanted (0 only with PGI_PGCPSZC0).
 * flags	may specify PG_MATCH_COLOR, which limits the search to large
 *		pages with the same (or an equivalent, see colorequiv) color
 *		as 'bin'.
 * pfnlo/pfnhi	inclusive pfn range to search.
 * pfnflag	selects the subset ("slot") of the pfn range to search; values
 *		of 0 and 1 leave the range untrimmed.
 *
 * Returns the first page_t of the claimed large page, or NULL if no candidate
 * in the range could be locked and claimed.  The memsegs lock is held for the
 * duration of the memseg walk and dropped before returning.
 */
static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, int pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t	szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;
	page_t	*pp, *randpp, *endpp;
	uint_t	colors;
	uint_t	matchcolors;
	pfn_t	hi, lo;
	uint_t	skip;

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	ASSERT(bin < colors);

	/*
	 * Factor in colorequiv to check additional 'equivalent' bins.
	 *
	 * This is computed once, outside the memseg loop, so that every
	 * memseg is searched with the same color granularity.  Dividing
	 * inside the loop would shrink the color count again for each
	 * memseg visited.
	 */
	matchcolors = colors;
	if ((flags & PG_MATCH_COLOR) && colorequiv > 1 && colors > colorequiv)
		matchcolors = colors / colorequiv;

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean the second slot of
	 * the pfn range that has been divided into 8 slots.
	 */
	if (pfnflag > 1) {
		int	slots = 1 << (highbit(pfnflag) - 1);
		int	slotid = pfnflag & (slots - 1);
		pgcnt_t	szcpages;
		int	slotlen;

		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
		pfnhi = pfnhi & ~(szcpgcnt - 1);

		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;

		/*
		 * After alignment the range may no longer contain a single
		 * naturally aligned 'szc' page.  Bail out here rather than
		 * divide by zero in the slot computation below; the untrimmed
		 * search would reject such a range as well.
		 */
		if (szcpages == 0)
			return (NULL);

		slotlen = howmany(szcpages, slots);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt);
	}

	memsegs_lock(0);

	/*
	 * loop through memsegs to look for contig page candidates
	 */

	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/* trim off kernel cage pages from pfn range */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);
		hi = hi & ~(szcpgcnt - 1);

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		if ((matchcolors > 1) && (flags & PG_MATCH_COLOR)) {
			uint_t	lobin;

			/* determine bin that lo currently points to */
			lobin = (lo & ((szcpgcnt * matchcolors) - 1)) /
			    szcpgcnt;

			/*
			 * set lo to point at appropriate color and set skip
			 * to arrive at the next szc page of the same color.
			 */
			lo += ((bin - lobin) & (matchcolors - 1)) * szcpgcnt;

			skip = matchcolors * szcpgcnt;
		} else {
			/* check all pages starting from lo */
			skip = szcpgcnt;
		}
		if (hi <= lo)
			/* mseg cannot satisfy color request */
			continue;

		/*
		 * Randomly choose a point between lo and hi to begin the
		 * search.  The random offset is rounded down to a multiple
		 * of 'skip' relative to lo, so the starting page
		 *   - is never below lo,
		 *   - has the same color as lo, and
		 *   - is one of the pages lo + k * skip that the loop below
		 *     visits after wrapping, which is what guarantees that
		 *     the loop's "pp != randpp" test eventually fails.
		 * The mask is widened to pfn_t so it cannot truncate the pfn.
		 */
		randpfn = (pfn_t)GETTICK();
		randpfn = lo + ((randpfn % (hi - lo)) & ~((pfn_t)skip - 1));
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp = mseg->pages + (hi - mseg->pages_base);

		ASSERT(randpp + szcpgcnt <= endpp);

		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
			    colorequiv > 1 ||
			    PP_2_BIN(pp) == bin);
			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			pp += skip;
			if (pp >= endpp) {
				/* start from the beginning */
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}
/*
 * Controlling routine that searches through physical memory in an attempt
 * to claim a large page based on the input parameters.
 *
 * For each memory type selected by 'mtype' (and, with PGI_MT_RANGE, each
 * following type produced by MTYPE_NEXT) the pfn range of (mnode, mtype) is
 * handed to page_geti_contig_pages, which trims off the parts of the range
 * that overlap the kernel cage or do not match the requested page color if
 * PG_MATCH_COLOR is set.  Since this search is very expensive, the search
 * range may be further limited based on previous failure counts
 * (pgcpfailcnt[]) unless the request is PGI_PGCPHIPRI.
 *
 * For PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
 *
 * Returns the claimed page, or NULL if the mnode has no memory of the
 * requested type or no page could be claimed.
 */
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	pfn_t	range_lo, range_hi;	/* contig pages pfn range */
	page_t	*claimed;
	int	search_limit = 0;	/* no limit on search if 0 */
	int	failcnt;

	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	/* high priority requests are never subject to the search limit */
	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
		search_limit = pgcpfailcnt[szc];

	/* drop the color requirement to improve the chances of success */
	if ((flags & PGI_PGCPHIPRI) != 0 || search_limit != 0)
		flags &= ~PG_MATCH_COLOR;

	for (;;) {
		/* get pfn range based on mnode and mtype */
		MNODETYPE_2_PFN(mnode, mtype, range_lo, range_hi);
		ASSERT(range_hi >= range_lo);

		claimed = page_geti_contig_pages(mnode, bin, szc, flags,
		    range_lo, range_hi, search_limit);

		if (claimed != NULL) {
			failcnt = pgcpfailcnt[szc];
			if (failcnt != 0) {
				/* double the search size */
				pgcpfailcnt[szc] = failcnt >> 1;
			}
			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
			return (claimed);
		}

		/* only range requests move on to another memory type */
		if ((flags & PGI_MT_RANGE) == 0)
			break;

		/* LINTED */
		if (MTYPE_NEXT(mnode, mtype, flags) < 0)
			break;
	}

	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
	return (NULL);
}
/*
 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 *
 * The search first walks the memory nodes local to 'lgrp' (or to the
 * current thread's home lgroup when 'lgrp' does not exist), then the remote
 * ones.  If both walks fail and the conditions near the bottom of the loop
 * hold, the whole sequence is repeated once with page_get_contig_pages as
 * the per-mnode allocator.
 *
 * Returns the page, or NULL on failure.  Plain PAGESIZE requests (szc 0
 * without PGI_PGCPSZC0) return NULL right after the local walk; the caller
 * is expected to try page_get_cachelist next.
 */
/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as		*as = seg->s_as;
	lgrp_mnode_cookie_t	lgrp_cookie;
	page_t			*(*getter)(int, uint_t, int, uchar_t, uint_t);
	page_t			*pp = NULL;
	ulong_t			bin;
	uchar_t			szc;
	int			mnode;
	int			mtype;

	getter = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	} else if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
	    kcage_freemem < kcage_throttlefree + btop(size) &&
	    curthread != kcage_cageout_thread) {
		/*
		 * Set a "reserve" of kcage_throttlefree pages for
		 * PG_PANIC and cageout thread allocations.
		 *
		 * Everybody else has to serialize in
		 * page_create_get_something() to get a cage page, so
		 * that we don't deadlock cageout!
		 */
		return (NULL);
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	/*
	 * Convert size to page size code.
	 */
	szc = page_szc(size);
	if (szc == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/* bin is for base pagesize color - convert if larger pagesize. */
	if (szc)
		bin = page_convert_color(0, szc, bin);

	/*
	 * One pass per allocator: first page_get_mnode_freelist, then (if
	 * permitted) page_get_contig_pages.
	 */
	for (;;) {
		/*
		 * Try to get a local page first, but try remote if we can't
		 * get a page of the right color.
		 */
		LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
			pp = getter(mnode, bin, mtype, szc, flags);
			if (pp == NULL)
				continue;
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		ASSERT(pp == NULL);

		/*
		 * for non-SZC0 PAGESIZE requests, check cachelist before
		 * checking remote free lists.  Caller expected to call
		 * page_get_cachelist which will check local cache lists
		 * and remote free lists.
		 */
		if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
			return (NULL);
		}

		ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

		/*
		 * Try to get a non-local freelist page.
		 */
		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
			pp = getter(mnode, bin, mtype, szc, flags);
			if (pp == NULL)
				continue;
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
			return (pp);
		}
		ASSERT(pp == NULL);

		/*
		 * when the cage is off chances are page_get_contig_pages()
		 * will fail to lock a large page chunk therefore when the
		 * cage is off it's not called by default.  this can be
		 * changed via /etc/system.
		 *
		 * page_get_contig_pages() also called to acquire a base
		 * pagesize page for page_create_get_something().
		 */
		if ((flags & PG_NORELOC) || pg_contig_disable != 0 ||
		    !(kcage_on || pg_lpgcreate_nocage || szc == 0) ||
		    getter == page_get_contig_pages)
			break;

		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		getter = page_get_contig_pages;
	}

	/* remember contig failures so later searches can be narrowed */
	if (pgcplimitsearch && getter == page_get_contig_pages)
		pgcpfailcnt[szc]++;

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}
/*
 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages.  For each bin with pages,
 * try to lock one of them.  If no page can be locked, try the
 * next bin.  Return NULL if a page can not be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 *
 * The cachelists of the memory nodes local to 'lgrp' (or to the current
 * thread's home lgroup when 'lgrp' does not exist) are tried first; after
 * that both the freelists and the cachelists of the remaining memory nodes
 * are tried.
 */
/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	struct as		*as = seg->s_as;
	lgrp_mnode_cookie_t	lgrp_cookie;
	page_t			*pp;
	ulong_t			bin;
	int			mtype;
	/*LINTED*/
	int			mnode;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	ASSERT(bin <= page_colors_mask);

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp == NULL)
			continue;
		VM_STAT_ADD(vmm_vmstats.pgc_allocok);
		DTRACE_PROBE4(page__get,
		    lgrp_t *, lgrp,
		    int, mnode,
		    ulong_t, bin,
		    uint_t, flags);
		return (pp);
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	pp = NULL;
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/* a free page on this mnode is preferred ... */
		pp = page_get_mnode_freelist(mnode, bin, mtype, 0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			break;
		}

		/* ... over a cached one */
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			break;
		}
	}

	if (pp != NULL) {
		DTRACE_PROBE4(page__get,
		    lgrp_t *, lgrp,
		    int, mnode,
		    ulong_t, bin,
		    uint_t, flags);
		return (pp);
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}
/*
* Try to remove one page from the cachelists of memory node 'mnode' for
* memory type 'mtype', starting the search at color 'bin'.
*
* Returns a page that has been locked SE_EXCL and removed from its
* cachelist, or NULL if no page could be found and locked.
*
* flags: PG_MATCH_COLOR limits the search to 'bin' and the bins treated as
* equivalent to it (see cpucolors/nwaybins below). flags is also passed to
* MTYPE_START/MTYPE_NEXT and is examined by the retry logic at the bottom
* of the function.
*/
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
kmutex_t *pcm;
int i;
page_t *pp;
page_t *first_pp;
uint_t bin_marker;
int nwaybins, nwaycnt;
int cpucolors;
VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
/* LINTED */
MTYPE_START(mnode, mtype, flags);
if (mtype < 0) { /* mnode does not have memory in mtype range */
VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
return (NULL);
}
/* nwaybins == 0 means "not computed yet"; see the color match code below */
nwaybins = 0;
cpucolors = cpu_page_colors;
/*
* adjust cpucolors to possibly check additional 'equivalent' bins
* to try to minimize fragmentation of large pages by delaying calls
* to page_freelist_fill.
*/
if (colorequiv > 1) {
int equivcolors = page_colors / colorequiv;
if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
cpucolors = equivcolors;
}
/*
* Only hold one cachelist lock at a time, that way we
* can start anywhere and not have to worry about lock
* ordering.
*/
big_try_again:
nwaycnt = 0;
/*
* Visit at most page_colors + 1 bins. 'bin' is advanced at the bottom of
* each iteration.
*/
for (i = 0; i <= page_colors; i++) {
/*
* Peek at the list head without the lock; it is read again under
* pcm before being used.
*/
if (PAGE_CACHELISTS(mnode, bin, mtype)) {
pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
mutex_enter(pcm);
pp = PAGE_CACHELISTS(mnode, bin, mtype);
if (pp != NULL) {
first_pp = pp;
ASSERT(pp->p_vnode);
ASSERT(PP_ISAGED(pp) == 0);
ASSERT(pp->p_szc == 0);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
/* walk the circular list until a page can be locked */
while (!page_trylock(pp, SE_EXCL)) {
pp = pp->p_next;
ASSERT(pp->p_szc == 0);
if (pp == first_pp) {
/*
* We have searched the
* complete list!
* And all of them (might
* only be one) are locked.
* This can happen since
* these pages can also be
* found via the hash list.
* When found via the hash
* list, they are locked
* first, then removed.
* We give up to let the
* other thread run.
*/
pp = NULL;
break;
}
ASSERT(pp->p_vnode);
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp) == 0);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
mnode);
}
if (pp) {
page_t **ppp;
/*
* Found and locked a page.
* Pull it off the list.
*/
ASSERT(mtype == PP_2_MTYPE(pp));
ppp = &PAGE_CACHELISTS(mnode, bin,
mtype);
page_sub(ppp, pp);
/*
* Subtract counters before releasing
* pcm mutex to avoid a race with
* page_freelist_coalesce and
* page_freelist_fill.
*/
page_ctr_sub(pp, PG_CACHE_LIST);
mutex_exit(pcm);
ASSERT(pp->p_vnode);
ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
ASSERT(!kcage_on ||
(flags & PG_NORELOC) == 0 ||
PP_ISNORELOC(pp));
/* a NORELOC page leaving the list is deducted from kcage_freemem */
if (PP_ISNORELOC(pp)) {
kcage_freemem_sub(1);
}
#endif
VM_STAT_ADD(vmm_vmstats.
pgmc_allocok);
return (pp);
}
}
/*
* Either the list was emptied after the unlocked peek or no
* page on it could be locked; drop the bin lock and move on.
*/
mutex_exit(pcm);
}
/*
* Wow! The initial bin is empty or no page in the bin could
* be locked.
*
* If specific color is needed, check if page color may be in
* other bins.
*/
if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
/*
* nwaybins is the number of bins considered equivalent to the
* requested one; it is computed on the first miss only.
*/
if (!nwaybins) {
if (cpucolors < 0) {
cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
ASSERT(cpucolors > 0);
nwaybins = page_colors / cpucolors;
if (nwaybins < 2)
cpucolors = 0;
} else {
nwaybins = page_colors / cpucolors;
ASSERT(nwaybins > 1);
}
}
/* stop once every equivalent bin has been tried */
if (++nwaycnt >= nwaybins) {
break;
}
bin = (bin + (page_colors / nwaybins)) &
page_colors_mask;
continue;
}
/*
* No color restriction applies here: step by BIN_STEP after the
* first miss and by vac_colors afterwards. When the walk comes back
* to bin_marker, shift by one bin and start a new cycle from there.
*/
if (i == 0) {
bin = (bin + BIN_STEP) & page_colors_mask;
bin_marker = bin;
} else {
bin = (bin + vac_colors) & page_colors_mask;
if (bin == bin_marker) {
bin = (bin + 1) & page_colors_mask;
bin_marker = bin;
}
}
}
#if defined(__sparc)
/*
* Setting PG_NORELOC below makes this test fail on the second pass, so
* the retry happens at most once.
*/
if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
(kcage_freemem >= kcage_lotsfree)) {
/*
* The Cage is ON and with plenty of free mem, and
* we're willing to check for a NORELOC page if we
* couldn't find a RELOC page, so spin again.
*/
flags |= PG_NORELOC;
mtype = MTYPE_NORELOC;
goto big_try_again;
}
#else
/* range requests retry with the next mtype, if MTYPE_NEXT yields one */
if (flags & PGI_MT_RANGE) {
MTYPE_NEXT(mnode, mtype, flags);
if (mtype >= 0)
goto big_try_again;
}
#endif
VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
return (NULL);
}
/*
* Event counters for page_get_replacement_page(). They are compiled in
* only when REPL_PAGE_STATS is defined, which happens for DEBUG builds.
* REPL_STAT_INCR(v) adds 1 to counter 'v' with atomic_add_32(); without
* REPL_PAGE_STATS it expands to nothing.
*/
#ifdef DEBUG
#define REPL_PAGE_STATS
#endif /* DEBUG */
#ifdef REPL_PAGE_STATS
struct repl_page_stats {
uint_t ngets; /* calls to page_get_replacement_page */
uint_t ngets_noreloc; /* calls where the page to replace is NORELOC */
uint_t npgr_noreloc; /* calls that asked for PGR_NORELOC */
uint_t nnopage_first; /* NOTE(review): no increment visible nearby */
uint_t nnopage; /* calls that failed to get all pages */
uint_t nhashout; /* cachelist pages hashed out for reuse */
uint_t nnofree; /* local freelist had no suitable page */
uint_t nnext_pp; /* constituent pages added to the result */
} repl_page_stats;
#define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */
/*
* When nonzero, page_get_replacement_page() tries page_get_contig_pages()
* for a large replacement page even if PGR_SAMESZC was not requested.
* There is no initializer, so it starts out as 0.
*/
int pgrppgcp;
/*
* Obtain replacement page(s) for the page that 'orig_like_pp' belongs to.
*
* The freemem accounting must be done by the caller.
* First we try to get a replacement page of the same size as like_pp,
* if that is not possible, then we just get a set of discontiguous
* PAGESIZE pages.
*
* orig_like_pp: page to be replaced; must be held SE_EXCL by the caller
* (asserted). It may be any constituent page of a large page; the search
* is always driven from the base page_t of that large page.
* lgrp: if it exists, only the memory nodes local to this lgroup are
* searched. Otherwise the memory node of the page being replaced is
* tried first, then remote memory nodes.
* pgrflags: PGR_NORELOC asks for PG_NORELOC pages (unless the page being
* replaced is itself NORELOC); PGR_SAMESZC forbids falling back to
* PAGESIZE pages. PGR_SAMESZC is forced for pages of kvp.
*
* Returns a list of pages with the FREE and AGED bits cleared, covering as
* many PAGESIZE pages as the page being replaced, or NULL if that many
* pages could not be obtained (any partial list is released through
* page_free_replacement_page).
*/
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp,
uint_t pgrflags)
{
page_t *like_pp;
page_t *pp, *pplist;
page_t *pl = NULL;
ulong_t bin;
int mnode, page_mnode;
int szc;
spgcnt_t npgs, pg_cnt;
pfn_t pfnum;
int mtype;
int flags = 0;
lgrp_mnode_cookie_t lgrp_cookie;
REPL_STAT_INCR(ngets);
like_pp = orig_like_pp;
ASSERT(PAGE_EXCL(like_pp));
/* npgs counts the PAGESIZE pages that still have to be obtained */
szc = like_pp->p_szc;
npgs = page_get_pagecnt(szc);
/*
* Now we reset like_pp to the base page_t.
* That way, we won't walk past the end of this 'szc' page.
*/
pfnum = PFN_BASE(like_pp->p_pagenum, szc);
like_pp = page_numtopp_nolock(pfnum);
ASSERT(like_pp->p_szc == szc);
/*
* Pick the allocation flags: PGI_RELOCONLY when the page being replaced
* is NORELOC, else PG_NORELOC when the caller asked for PGR_NORELOC.
* NOTE(review): PGI_RELOCONLY presumably restricts the search to
* relocatable pages - confirm against the flag's definition.
*/
if (PP_ISNORELOC(like_pp)) {
ASSERT(kcage_on);
REPL_STAT_INCR(ngets_noreloc);
flags = PGI_RELOCONLY;
} else if (pgrflags & PGR_NORELOC) {
ASSERT(kcage_on);
REPL_STAT_INCR(npgr_noreloc);
flags = PG_NORELOC;
}
/*
* Kernel pages must always be replaced with the same size
* pages, since we cannot properly handle demotion of kernel
* pages.
*/
if (like_pp->p_vnode == &kvp)
pgrflags |= PGR_SAMESZC;
/* LINTED */
MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode);
/*
* Each pass of this loop obtains one replacement unit - a 'szc' page, or
* a single PAGESIZE page once szc has been lowered to 0 - and moves its
* constituent pages onto pl.
*/
while (npgs) {
pplist = NULL;
/*
* Search ladder for one unit. Every exit from this for loop is a
* break; pplist is non-NULL on success.
*/
for (;;) {
pg_cnt = page_get_pagecnt(szc);
bin = PP_2_BIN(like_pp);
ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
ASSERT(pg_cnt <= npgs);
/*
* If an lgroup was specified, try to get the
* page from that lgroup.
*/
if (LGRP_EXISTS(lgrp)) {
/* Try the lgroup's freelists first */
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
LGRP_SRCH_LOCAL);
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
pplist = page_get_mnode_freelist(
mnode, bin, mtype, szc,
flags);
}
/*
* Now try its cachelists if this is a
* small page. Don't need to do it for
* larger ones since page_freelist_coalesce()
* already failed.
*/
if (pplist != NULL || szc != 0)
break;
/* Now try its cachelists */
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
LGRP_SRCH_LOCAL);
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
pplist = page_get_mnode_cachelist(
bin, flags, mnode, mtype);
}
/*
* A cachelist page still has an identity; hash it out and
* mark it aged so it looks like a freelist page.
*/
if (pplist != NULL) {
page_hashout(pplist, NULL);
PP_SETAGED(pplist);
REPL_STAT_INCR(nhashout);
break;
}
/* Done looking in this lgroup. Bail out. */
break;
}
ASSERT(!LGRP_EXISTS(lgrp));
/*
* No lgroup was specified, so just try to get the
* page as close to like_pp's mnode as possible.
* First try the local freelist...
*/
mnode = PP_2_MEM_NODE(like_pp);
pplist = page_get_mnode_freelist(mnode, bin,
mtype, szc, flags);
if (pplist != NULL)
break;
REPL_STAT_INCR(nnofree);
/*
* ...then the local cachelist. Don't need to do it for
* larger pages cause page_freelist_coalesce() already
* failed there anyway.
*/
if (szc == 0) {
pplist = page_get_mnode_cachelist(bin, flags,
mnode, mtype);
if (pplist != NULL) {
page_hashout(pplist, NULL);
PP_SETAGED(pplist);
REPL_STAT_INCR(nhashout);
break;
}
}
/* Now try remote freelists */
page_mnode = mnode;
/*
* NOTE(review): 'lgrp' is overwritten here. On a later pass of
* the outer loop LGRP_EXISTS(lgrp) may therefore hold and the
* lgroup-restricted path above would be taken instead of this
* one - confirm that this is intended.
*/
lgrp =
lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
LGRP_SRCH_HIER);
while (pplist == NULL &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
/*
* Skip local mnode.
*/
if ((mnode == page_mnode) ||
(mem_node_config[mnode].exists == 0))
continue;
pplist = page_get_mnode_freelist(mnode,
bin, mtype, szc, flags);
}
if (pplist != NULL)
break;
/* Now try remote cachelists */
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
LGRP_SRCH_HIER);
while (pplist == NULL && szc == 0) {
mnode = lgrp_memnode_choose(&lgrp_cookie);
if (mnode == -1)
break;
/*
* Skip local mnode.
*/
if ((mnode == page_mnode) ||
(mem_node_config[mnode].exists == 0))
continue;
pplist = page_get_mnode_cachelist(bin,
flags, mnode, mtype);
if (pplist != NULL) {
page_hashout(pplist, NULL);
PP_SETAGED(pplist);
REPL_STAT_INCR(nhashout);
break;
}
}
/*
* Break out of while loop under the following cases:
* - If we successfully got a page.
* - If pgrflags specified only returning a specific
* page size and we could not find that page size.
* - If we could not satisfy the request with PAGESIZE
* or larger pages.
*/
if (pplist != NULL || szc == 0)
break;
if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
/* try to find contig page */
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
LGRP_SRCH_HIER);
while ((pplist == NULL) &&
(mnode =
lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
pplist = page_get_contig_pages(
mnode, bin, mtype, szc,
flags | PGI_PGCPHIPRI);
}
break;
}
/*
* The correct thing to do here is try the next
* page size down using szc--. Due to a bug
* with the processing of HAT_RELOAD_SHARE
* where the sfmmu_ttecnt arrays of all
* hats sharing an ISM segment don't get updated,
* using intermediate size pages for relocation
* can lead to continuous page faults.
*/
szc = 0;
}
if (pplist != NULL) {
DTRACE_PROBE4(page__get,
lgrp_t *, lgrp,
int, mnode,
ulong_t, bin,
uint_t, flags);
/*
* Move the constituent pages onto pl. pg_cnt is decremented
* only while pplist is non-NULL, so it ends at 0 when the
* list held exactly pg_cnt pages.
*/
while (pplist != NULL && pg_cnt--) {
ASSERT(pplist != NULL);
pp = pplist;
page_sub(&pplist, pp);
PP_CLRFREE(pp);
PP_CLRAGED(pp);
page_list_concat(&pl, &pp);
npgs--;
like_pp = like_pp + 1;
REPL_STAT_INCR(nnext_pp);
}
ASSERT(pg_cnt == 0);
} else {
/* nothing could be found for this unit; give up */
break;
}
}
if (npgs) {
/*
* We were unable to allocate the necessary number
* of pages.
* We need to free up any pl.
*/
REPL_STAT_INCR(nnopage);
page_free_replacement_page(pl);
return (NULL);
} else {
return (pl);
}
}
/*
 * Demote a free large page to its constituent PAGESIZE pages.
 *
 * 'pp' must be a locked, free page whose size code is non-zero on entry
 * (all asserted).  The size code is looked at again once the freelist lock
 * of the page's memory node is held, and page_demote() is called only if
 * the page is still a large page at that point.
 */
void
page_demote_free_pages(page_t *pp)
{
	int	mnode;
	uint_t	cur_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);

	page_freelist_lock(mnode);
	cur_szc = pp->p_szc;
	if (cur_szc != 0) {
		pfn_t	base_pfn = PFN_BASE(pp->p_pagenum, cur_szc);

		(void) page_demote(mnode, base_pfn, cur_szc, 0,
		    PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);

	ASSERT(pp->p_szc == 0);
}