vm_pagelist.c revision ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley 4.3 BSD
* under license from the Regents of the University of California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * and were modified to function in a platform independent manner.
 */
#include <sys/sysmacros.h>
#include <vm/seg_kmem.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
extern uint_t vac_colors;
#define MAX_PRAGMA_ALIGN 128
/* vm_cpu_data0 for the boot cpu before kmem is initialized */
#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
/*
 * number of page colors equivalent to requested color in page_get routines.
* If set, keeps large pages intact longer and keeps MPO allocation
* from the local mnode in favor of acquiring the 'correct' page color from
* a demoted large page or from a remote mnode.
*/
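/*
 * A sketch of the tunable the comment above describes; the name is taken
 * from its use in page_set_colorequiv_arr() below, the type is an
 * assumption.
 */
uint_t colorequiv;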
/*
* color equivalency mask for each page size.
* Mask is computed based on cpu L2$ way sizes and colorequiv global.
* High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (only relevant for hashed index based page coloring).
*/
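/*
 * A sketch of the per-pagesize mask array the comment above describes; the
 * name is taken from its use in page_set_colorequiv_arr() below, the
 * element type is an assumption based on the 4-bit/4-bit encoding above.
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];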
/*
* if set, specifies the percentage of large pages that are free from within
* a large page region before attempting to lock those pages for
* page_get_contig_pages processing.
*
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
*/
int ptcpthreshold;
/*
* Limit page get contig page search based on failure cnts in pgcpfailcnt[].
* Enabled by default via pgcplimitsearch.
*
* pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
* memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
* bound. This upper bound range guarantees:
* - all large page 'slots' will be searched over time
 * - at least one large page candidate is considered on each pgcp call
* - count doesn't wrap around to 0
*/
int pgcplimitsearch = 1;
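/*
 * A sketch of the failure-count state the comment above describes; the
 * array name comes from that comment, and the PGCPFAILMAX expression is an
 * assumption that satisfies the ">= 1/2 of installed memory" bound.
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
#define	PGCPFAILMAX	(1 << (highbit(physinstalled) - 1))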
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
#ifdef VM_STATS
struct vmm_vmstats_str vmm_vmstats;
#endif /* VM_STATS */
#if defined(__sparc)
#define LPGCREATE 0
#else
/* enable page_get_contig_pages */
#define LPGCREATE 1
#endif
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;
/*
* page_freelist_split pfn flag to signify no hi pfn requirement.
*/
#define PFNNULL 0
/* Flags involved in promotion and demotion routines */
/*
* Flag for page_demote to be used with PC_FREE to denote that we don't care
* what the color is as the color parameter to the function is ignored.
*/
#define PC_NO_COLOR (-1)
/* mtype value for page_promote to use when mtype does not matter */
#define PC_MTYPE_ANY (-1)
/*
* page counters candidates info
* See page_ctrs_cands comment below for more details.
* fields are as follows:
* pcc_pages_free: # pages which freelist coalesce can create
* pcc_color_free: pointer to page free counts per color
*/
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;		/* # pages freelist coalesce can create */
	pgcnt_t	*pcc_color_free;	/* pointer to page free counts per color */
} pcc_info_t;
/*
* On big machines it can take a long time to check page_counters
* arrays. page_ctrs_cands is a summary array whose elements are a dynamically
* updated sum of all elements of the corresponding page_counters arrays.
* page_freelist_coalesce() searches page_counters only if an appropriate
* element of page_ctrs_cands array is greater than 0.
*
* page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
*/
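/*
 * A sketch of the summary array declaration, shaped to match the
 * [i][r][m][g] indexing used by the macros below; the exact dimensioning
 * beyond NPC_MUTEX and MMU_PAGE_SIZES is an assumption.
 */
static pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];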
/*
* Return in val the total number of free pages which can be created
* for the given mnode (m), mrange (g), and region size (r)
*/
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
	}								\
}
/*
* Return in val the total number of free pages which can be created
* for the given mnode (m), mrange (g), region size (r), and color (c)
*/
#define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
int i; \
val = 0; \
ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
for (i = 0; i < NPC_MUTEX; i++) { \
val += \
page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
} \
}
/*
* We can only allow a single thread to update a counter within the physical
* range of the largest supported page size. That is the finest granularity
* possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
* ctr_mutex lock index for a particular physical range.
*/
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
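/*
 * A sketch of the ctr_mutex lock array referenced by the comment above and
 * by the "/* ctr_mutex */" markers below; the per-mnode dimensioning is an
 * assumption matching the ctr_mutex[lckidx][mnode] usage in page_ctr_add().
 */
static kmutex_t ctr_mutex[NPC_MUTEX][MAX_MEM_NODES];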
#define INVALID_COLOR 0xffffffff
#define INVALID_MASK 0xffffffff
/*
* Local functions prototypes.
*/
void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
/*
* The page_counters array below is used to keep track of free contiguous
* physical memory. A hw_page_map_t will be allocated per mnode per szc.
* This contains an array of counters, the size of the array, a shift value
* used to convert a pagenum into a counter array index or vice versa, as
* well as a cache of the last successful index to be promoted to a larger
* page size. As an optimization, we keep track of the last successful index
* to be promoted per page color for the given size region, and this is
* allocated dynamically based upon the number of colors for a given
* region size.
*
* Conceptually, the page counters are represented as:
*
* page_counters[region_size][mnode]
*
* region_size: size code of a candidate larger page made up
* of contiguous free smaller pages.
*
* page_counters[region_size][mnode].hpm_counters[index]:
* represents how many (region_size - 1) pages either
* exist or can be created within the given index range.
*
* Let's look at a sparc example:
* If we want to create a free 512k page, we look at region_size 2
* for the mnode we want. We calculate the index and look at a specific
* hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
* this location, it means that 8 64k pages either exist or can be created
* from 8K pages in order to make a single free 512k page at the given
* index. Note that when a region is full, it will contribute to the
* counts in the region above it. Thus we will not know what page
* size the free pages will be which can be promoted to this new free
* page unless we look at all regions below the current region.
*/
/*
* Note: hpmctr_t is defined in platform vm_dep.h
* hw_page_map_t contains all the information needed for the page_counters
* logic. The fields are as follows:
*
* hpm_counters: dynamically allocated array to hold counter data
* hpm_entries: entries in hpm_counters
* hpm_base: PFN mapped to counter index 0
* hpm_color_current: last index in counter array for this color at
* which we successfully created a large page
*/
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;
/*
* Element zero is not used, but is allocated for convenience.
*/
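/*
 * A sketch of the array the comment above refers to, shaped to match the
 * conceptual page_counters[region_size][mnode] indexing described earlier;
 * the pointer-per-szc layout is an assumption.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];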
/*
* Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
*/
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
/*
* The following macros are convenient ways to get access to the individual
* elements of the page_counters arrays. They can be used on both
* the left side and right side of equations.
*/
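/*
 * A hedged sketch of the accessors the comment above describes, inferred
 * from their uses later in this file (PAGE_COUNTERS_COUNTERS, _SHIFT,
 * _ENTRIES, _BASE, PNUM_TO_IDX, IDX_TO_PNUM); the original set is likely
 * larger.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)
#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)
#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)
#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))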
/*
* Protects the hpm_counters and hpm_color_current memory from changing while
* looking at page counters information.
* Grab the write lock to modify what these fields point at.
* Grab the read lock to prevent any pointers from changing.
* The write lock can not be held during memory allocation due to a possible
* recursion deadlock with trying to grab the read lock while the
* write lock is already held.
*/
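/*
 * A sketch of the lock the comment above describes; the name follows the
 * page_ctrs_rwlock usage mentioned in page_ctrs_adjust() below, the
 * per-mnode dimensioning is an assumption.
 */
static krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];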
/*
* initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
*/
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *)P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}
/*
* free cpu_vm_data
*/
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_vm_data != NULL && cp != CPU0) {
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}
/*
* page size to page size code
*/
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}
/*
* page size to page size code with the restriction that it be a supported
* user page size. If it's not a supported user page size, -1 will be returned.
*/
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);

	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}
/*
* Return how many page sizes are available for the user to use. This is
* what the hardware supports and not based upon how the OS implements the
* support of different page sizes.
*/
uint_t
page_num_user_pagesizes(void)
{
return (mmu_exported_page_sizes);
}
uint_t
page_num_pagesizes(void)
{
return (mmu_page_sizes);
}
/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in. An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}
/*
* this assigns the desired equivalent color after a split
*/
{
}
/*
* The interleaved_mnodes flag is set when mnodes overlap in
* the physbase..physmax range, but have disjoint slices.
* In this case hpm_counters is shared by all mnodes.
* This flag is set dynamically by the platform.
*/
int interleaved_mnodes = 0;
/*
* Called by startup().
* Size up the per page size free list counters based on physmax
* of each node and max_mem_nodes.
*
* If interleaved_mnodes is set we need to find the first mnode that
* exists. hpm_counters for the first mnode will then be shared by
* all other mnodes. If interleaved_mnodes is not set, just set
* first=mnode each time. That means there will be no sharing.
*/
size_t
page_ctrs_sz(void)
{
int r; /* region size */
int mnode;
int firstmn; /* first mnode that exists */
int nranges;
int i;
/*
* We need to determine how many page colors there are for each
* page size in order to allocate memory for any color specific
* arrays.
*/
for (i = 0; i < mmu_page_sizes; i++) {
colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
}
continue;
/*
* determine size needed for page counter arrays with
* base aligned to large page size.
*/
for (r = 1; r < mmu_page_sizes; r++) {
/* add in space for hpm_color_current */
colors_per_szc[r] * nranges;
continue;
/* add in space for hpm_counters */
r_align = page_get_pagecnt(r);
/*
* Round up to always allocate on pointer sized
* boundaries.
*/
sizeof (hpmctr_t *));
}
}
for (r = 1; r < mmu_page_sizes; r++) {
}
/* add in space for page_ctrs_cands and pcc_color_free */
continue;
for (r = 1; r < mmu_page_sizes; r++) {
colors_per_szc[r] * NPC_MUTEX;
}
}
/* ctr_mutex */
/* size for page list counts */
/*
* add some slop for roundups. page_ctrs_alloc will roundup the start
* address of the counters to ecache_alignsize boundary for every
* memory node.
*/
}
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
int mnode;
int r; /* region size */
int i;
int firstmn; /* first mnode that exists */
/*
* We need to determine how many page colors there are for each
* page size in order to allocate memory for any color specific
* arrays.
*/
for (i = 0; i < mmu_page_sizes; i++) {
colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
}
for (r = 1; r < mmu_page_sizes; r++) {
}
/* page_ctrs_cands and pcc_color_free array */
for (i = 0; i < NPC_MUTEX; i++) {
for (r = 1; r < mmu_page_sizes; r++) {
pcc_info_t *pi;
continue;
pi->pcc_color_free =
(pgcnt_t *)alloc_base;
alloc_base += sizeof (pgcnt_t) *
colors_per_szc[r];
pi++;
}
}
}
}
/* ctr_mutex */
for (i = 0; i < NPC_MUTEX; i++) {
}
/* initialize page list counts */
int r_shift;
continue;
for (r = 1; r < mmu_page_sizes; r++) {
/*
* the page_counters base has to be aligned to the
* page count of page size code r otherwise the counts
* will cross large page boundaries.
*/
r_align = page_get_pagecnt(r);
/* base needs to be aligned - lower to aligned value */
r_shift = PAGE_BSZS_SHIFT(r);
alloc_base += sizeof (size_t) *
colors_per_szc[r];
}
for (i = 0; i < colors_per_szc[r]; i++) {
int mrange;
PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
}
}
/* hpm_counters may be shared by all mnodes */
PAGE_COUNTERS_COUNTERS(mnode, r) =
(hpmctr_t *)alloc_base;
alloc_base +=
sizeof (hpmctr_t *));
} else {
PAGE_COUNTERS_COUNTERS(mnode, r) =
}
/*
* Verify that PNUM_TO_IDX and IDX_TO_PNUM
* satisfy the identity requirement.
* We should be able to go from one to the other
* and get consistent values.
*/
(IDX_TO_PNUM(mnode, r, 0))) == 0);
}
/*
* Roundup the start address of the page_counters to
* cache aligned boundary for every memory node.
* page_ctrs_sz() has added some slop for these roundups.
*/
}
/* Initialize other page counter specific data structures. */
}
return (alloc_base);
}
/*
* Functions to adjust region counters for each size free list.
* Caller is responsible to acquire the ctr_mutex lock if necessary and
* thus can be called during startup without locks.
*/
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
ssize_t r; /* region size */
int lckidx;
/* no counter update needed for largest page size */
return;
}
/*
* Increment the count of free pages for the current
* region. Continue looping up in region size incrementing
 * count if the preceding region is full.
*/
while (r < mmu_page_sizes) {
break;
} else {
cand->pcc_pages_free++;
}
r++;
}
}
void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
int lckidx;
ssize_t r; /* region size */
/* no counter update needed for largest page size */
return;
}
/*
* Decrement the count of free pages for the current
* region. Continue looping up in region size decrementing
 * count if the preceding region was full.
*/
while (r < mmu_page_sizes) {
break;
} else {
cand->pcc_pages_free--;
}
r++;
}
}
void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
/*
* Adjust page counters following a memory attach, since typically the
* size of the array needs to change, and the PFN to counter index
* mapping needs to change.
*
* It is possible this mnode did not exist at startup. In that case
* allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
* to change (a theoretical possibility on x86), which means pcc_color_free
* arrays must be extended.
*/
int
page_ctrs_adjust(int mnode)
{
int r; /* region size */
int i;
int cands_cache_nranges;
int old_maxmrange, new_maxmrange;
int rc = 0;
if (cands_cache == NULL)
return (ENOMEM);
i = -1;
/* prepare to free non-null pointers on the way out */
/*
* We need to determine how many page colors there are for each
* page size in order to allocate memory for any color specific
* arrays.
*/
for (r = 0; r < mmu_page_sizes; r++) {
colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
}
/*
* Preallocate all of the new hpm_counters arrays as we can't
* hold the page_ctrs_rwlock as a writer and allocate memory.
* If we can't allocate all of the arrays, undo our work so far
* and return failure.
*/
for (r = 1; r < mmu_page_sizes; r++) {
size_cache[r] = pcsz;
sizeof (hpmctr_t), KM_NOSLEEP);
goto cleanup;
}
}
/*
* Preallocate all of the new color current arrays as we can't
* hold the page_ctrs_rwlock as a writer and allocate memory.
* If we can't allocate all of the arrays, undo our work so far
* and return failure.
*/
for (r = 1; r < mmu_page_sizes; r++) {
colors_per_szc[r], KM_NOSLEEP);
goto cleanup;
}
}
}
/*
* Preallocate all of the new pcc_info_t arrays as we can't
* hold the page_ctrs_rwlock as a writer and allocate memory.
* If we can't allocate all of the arrays, undo our work so far
* and return failure.
*/
for (r = 1; r < mmu_page_sizes; r++) {
for (i = 0; i < NPC_MUTEX; i++) {
goto cleanup;
}
sizeof (pgcnt_t), KM_NOSLEEP);
goto cleanup;
}
}
}
}
/*
* Grab the write lock to prevent others from walking these arrays
* while we are modifying them.
*/
for (r = 1; r < mmu_page_sizes; r++) {
r, mrange);
}
/*
* Map the intersection of the old and new
* counters into the new array.
*/
PAGE_COUNTERS_SHIFT(mnode, r);
sizeof (hpmctr_t));
} else {
PAGE_COUNTERS_SHIFT(mnode, r);
sizeof (hpmctr_t));
}
}
/* update shared hpm_counters in other mnodes */
if (interleaved_mnodes) {
for (i = 0; i < max_mem_nodes; i++) {
if (i == mnode)
continue;
if (mem_node_config[i].exists == 0)
continue;
PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
PAGE_COUNTERS_BASE(i, r) = newbase;
}
}
color_cache[r][mrange];
}
/*
* for now, just reset on these events as it's probably
* not worthwhile to try and optimize this.
*/
for (i = 0; i < colors_per_szc[r]; i++) {
(mnode + 1);
int m;
if (mem_node_config[m].exists == 0)
continue;
color_mask, &it);
}
}
}
/* cache info for freeing out of the critical path */
size_cache[r] = old_csz;
}
}
}
/*
* Verify that PNUM_TO_IDX and IDX_TO_PNUM
* satisfy the identity requirement.
* We should be able to go from one to the other
* and get consistent values.
*/
(IDX_TO_PNUM(mnode, r, 0))) == 0);
/* pcc_info_t and pcc_color_free */
for (i = 0; i < NPC_MUTEX; i++) {
/* preserve old pcc_color_free values, if any */
continue;
/*
* possible change in range index when
* preserving pcc_info
*/
if (new_maxmrange > old_maxmrange) {
} else if (new_maxmrange < old_maxmrange) {
}
}
}
}
/*
* Now that we have dropped the write lock, it is safe to free all
* of the memory we have cached above.
* We come thru here to free memory when pre-alloc fails, and also to
* free old pointers which were recorded while locked.
*/
for (r = 1; r < mmu_page_sizes; r++) {
size_cache[r] * sizeof (hpmctr_t));
}
colors_per_szc[r] * sizeof (size_t));
}
}
for (i = 0; i < NPC_MUTEX; i++) {
continue;
continue;
colors_per_szc[r] *
sizeof (pgcnt_t));
}
}
}
}
}
return (rc);
}
#ifdef DEBUG
/*
* confirm pp is a large page corresponding to szc
*/
void
{
if (npgs == 1) {
return;
}
/*
* Check list of pages.
*/
while (npgs--) {
if (npgs != 0) {
}
}
}
#endif /* DEBUG */
void
page_freelist_lock(int mnode)
{
	int i;

	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;

	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}
/*
* add pp to the specified page list. Defaults to head of the page list
* unless PG_LIST_TAIL is specified.
*/
void
{
int mnode;
/*
* Large pages should be freed via page_list_add_pages().
*/
/*
* Don't need to lock the freelist first here
* because the page isn't on the freelist yet.
* This means p_szc can't change on us.
*/
if (flags & PG_LIST_ISINIT) {
/*
* PG_LIST_ISINIT is set during system startup (ie. single
* threaded), add a page to the free list and add to the
* the free region counters w/o any locking
*/
/* inline version of page_add() */
} else
} else {
if (flags & PG_FREE_LIST) {
} else {
}
if (flags & PG_LIST_TAIL)
/*
* Add counters before releasing pcm mutex to avoid a race with
* page_freelist_coalesce and page_freelist_split.
*/
}
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
}
#endif
/*
* It is up to the caller to unlock the page!
*/
}
#ifdef __sparc
/*
* This routine is only used by kcage_init during system startup.
* It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
* without the overhead of taking locks and updating counters.
*/
void
{
int mnode;
int mtype;
int flags = 0;
/*
* If this is a large page on the freelist then
* break it up into smaller pages.
*/
/*
* Get list page is currently on.
*/
flags |= PG_FREE_LIST;
} else {
flags |= PG_CACHE_LIST;
}
/*
* Delete page from current list.
*/
} else {
}
/*
* Decrement page counters
*/
/*
* Set no reloc for cage initted pages.
*/
/*
* Get new list for page.
*/
} else {
}
/*
* Insert page on new list.
*/
} else {
}
/*
* Increment page counters
*/
/*
* Update cage freemem counter
*/
}
#else /* __sparc */
/* ARGSUSED */
void
{
panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif
void
{
int mnode;
if (flags & PG_LIST_ISINIT) {
} else {
#if defined(__sparc)
if (PP_ISNORELOC(pp))
#endif
}
}
/*
* During boot, need to demote a large page to base
* pagesize pages for seg_kmem for use in boot_alloc()
*/
void
{
PC_FREE);
}
/*
* Take a particular page off of whatever freelist the page
* is claimed to be on.
*
* NOTE: Only used for PAGESIZE pages.
*/
void
{
int bin;
int mnode;
/*
* The p_szc field can only be changed by page_promote()
* and page_demote(). Only free pages can be promoted and
* demoted and the free list MUST be locked during these
* operations. So to prevent a race in page_list_sub()
* between computing which bin of the freelist lock to
* grab and actually grabing the lock we check again that
* the bin we locked is still the correct one. Notice that
* the p_szc field could have actually changed on us but
* if the bin happens to still be the same we are safe.
*/
goto try_again;
}
if (flags & PG_FREE_LIST) {
} else {
}
/*
* Common PAGESIZE case.
*
 * Note that we locked the freelist. This guarantees that
 * the page's p_szc cannot change until we drop the pcm mutex.
*/
/*
* Subtract counters before releasing pcm mutex
* to avoid race with page_freelist_coalesce.
*/
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
}
#endif
return;
}
/*
* Large pages on the cache list are not supported.
*/
if (flags & PG_CACHE_LIST)
panic("page_list_sub: large page on cachelist");
/*
* Slow but rare.
*
* Somebody wants this particular page which is part
* of a large page. In this case we just demote the page
* if it's on the freelist.
*
* We have to drop pcm before locking the entire freelist.
* Once we have re-locked the freelist check to make sure
* the page hasn't already been demoted or completely
* freed.
*/
/*
* Large page is on freelist.
*/
}
/*
* Subtract counters before releasing pcm mutex
* to avoid race with page_freelist_coalesce.
*/
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
}
#endif
}
void
{
int mnode;
/*
* See comment in page_list_sub().
*/
goto try_again;
}
/*
* If we're called with a page larger than szc or it got
* promoted above szc before we locked the freelist then
* drop pcm and re-lock entire freelist. If page still larger
* than szc then demote it.
*/
(void) page_demote(mnode,
}
}
} else {
}
} else {
}
#if defined(__sparc)
if (PP_ISNORELOC(pp)) {
}
#endif
}
/*
* Add the page to the front of a linked list of pages
* using the p_next & p_prev pointers for the list.
* The caller is responsible for protecting the list pointers.
*/
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}
/*
* Remove this page from a linked list of pages
* using the p_next & p_prev pointers for the list.
*
* The caller is responsible for protecting the list pointers.
*/
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}
/*
* Routine fsflush uses to gradually coalesce the free list into larger pages.
*/
void
{
int mnode;
int idx;
}
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
/*
* Create a single larger page (of szc new_szc) from smaller contiguous pages
* for the given mnode starting at pfnum. Pages involved are on the freelist
* before the call and may be returned to the caller if requested, otherwise
* they will be placed back on the freelist.
* If flags is PC_ALLOC, then the large page will be returned to the user in
* a state which is consistent with a page being taken off the freelist. If
* we failed to lock the new large page, then we will return NULL to the
* caller and put the large page on the freelist instead.
* If flags is PC_FREE, then the large page will be placed on the freelist,
* and NULL will be returned.
* The caller is responsible for locking the freelist as well as any other
* accounting which needs to be done for a returned page.
*
* RFE: For performance pass in pp instead of pfnum so
* we can avoid excessive calls to page_numtopp_nolock().
 * This would depend on an assumption that all contiguous
 * pages are in the same memseg so we can just add/subtract
 * our pp.
*
* Lock ordering:
*
* There is a potential but rare deadlock situation
* for page promotion and demotion operations. The problem
* is there are two paths into the freelist manager and
* they have different lock orders:
*
* page_create()
* lock freelist
* page_lock(EXCL)
* unlock freelist
* return
* caller drops page_lock
*
* page_free() and page_reclaim()
* caller grabs page_lock(EXCL)
*
* lock freelist
* unlock freelist
* drop page_lock
*
* What prevents a thread in page_create() from deadlocking
* with a thread freeing or reclaiming the same page is the
* page_trylock() in page_get_freelist(). If the trylock fails
* it skips the page.
*
* The lock ordering for promotion and demotion is the same as
* for page_create(). Since the same deadlock could occur during
* page promotion and freeing or reclaiming of a page on the
* cache list we might have to fail the operation and undo what
 * we have done so far. Again this is rare.
*/
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
int which_list;
/*
* General algorithm:
* Find the starting page
* Walk each page struct removing it from the freelist,
* and linking it to all the other pages removed.
* Once all pages are off the freelist,
* walk the list, modifying p_szc to new_szc and what
* ever other info needs to be done to create a large free page.
* According to the flags, either return the page or put it
* on the freelist.
*/
/* don't return page of the wrong mtype */
return (NULL);
/*
* Loop through smaller pages to confirm that all pages
* give the same result for PP_ISNORELOC().
* We can check this reliably here as the protocol for setting
* P_NORELOC requires pages to be taken off the free list first.
*/
return (NULL);
}
}
/* Loop around coalescing the smaller pages into a big page. */
while (pages_left) {
/*
* Remove from the freelist.
*/
/*
* PG_FREE_LIST
*/
} else {
}
} else {
/*
* PG_CACHE_LIST
*
* Since this page comes from the
* cachelist, we must destroy the
* vnode association.
*/
goto fail_promote;
}
/*
* We need to be careful not to deadlock
* with another thread in page_lookup().
* The page_lookup() thread could be holding
* the same phm that we need if the two
* pages happen to hash to the same phm lock.
* At this point we have locked the entire
* freelist and page_lookup() could be trying
* to grab a freelist lock.
*/
if (!mutex_tryenter(phm)) {
goto fail_promote;
}
PP_SETAGED(pp);
}
/*
* Concatenate the smaller page(s) onto
* the large page list.
*/
pages_left -= npgs;
while (npgs--) {
}
}
/*
* return the page to the user if requested
* in the properly locked state.
*/
return (pplist);
}
/*
* Otherwise place the new large page on the freelist
*/
return (NULL);
/*
* A thread must have still been freeing or
* reclaiming the page on the cachelist.
* To prevent a deadlock undo what we have
 * done so far and return failure. This
* situation can only happen while promoting
* PAGESIZE pages.
*/
while (pplist) {
}
return (NULL);
}
/*
* Break up a large page into smaller size pages.
* Pages involved are on the freelist before the call and may
* be returned to the caller if requested, otherwise they will
* be placed back on the freelist.
* The caller is responsible for locking the freelist as well as any other
* accounting which needs to be done for a returned page.
* If flags is not PC_ALLOC, the color argument is ignored, and thus
* technically, any value may be passed in but PC_NO_COLOR is the standard
* which should be followed for clarity's sake.
*/
page_t *
{
/*
* Number of PAGESIZE pages for smaller new_szc
* page.
*/
while (pplist) {
/*
* We either break it up into PAGESIZE pages or larger.
*/
} else {
}
} else {
/*
* Break down into smaller lists of pages.
*/
n = npgs;
while (n--) {
}
} else {
}
}
}
return (ret_pp);
}
int mpss_coalesce_disable = 0;
/*
* Coalesce free pages into a page of the given szc and color if possible.
* Return the pointer to the page created, otherwise, return NULL.
*
* If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
*/
page_t *
{
int r = szc; /* region size */
int mrange;
#if defined(__sparc)
#endif
if (mpss_coalesce_disable) {
return (NULL);
}
/* Prevent page_counters dynamic memory from being freed */
/* get pfn range for mtype */
#if defined(__sparc)
#else
hi++;
#endif
/* use lower limit if given */
/* round to szcpgcnt boundaries */
/* set lo to the closest pfn of the right color */
&it);
}
return (NULL);
}
full = FULL_REGION_CNT(r);
/* calculate the number of page candidates and initial search index */
do {
if (acand) {
}
if (cands == 0) {
return (NULL);
}
} else {
/* invalid color, get the closest correct pfn */
color_mask, &it);
}
}
}
/* set starting index */
#if defined(__sparc)
nhi = 0; /* search kcage ranges */
#endif
#if defined(__sparc)
/*
* Find lowest intersection of kcage ranges and mnode.
* MTYPE_NORELOC means look in the cage, otherwise outside.
*/
goto wrapit;
/* jump to the next page in the range */
goto wrapit;
goto next;
if (interleaved_mnodes &&
goto next;
}
}
#endif
goto next;
/*
* RFE: For performance maybe we can do something less
* brutal than locking the entire freelist. So far
* this doesn't seem to be a performance problem?
*/
ret_pp =
#if defined(__sparc)
if (PP_ISNORELOC(ret_pp)) {
}
#endif
return (ret_pp);
}
} else {
}
/*
* No point looking for another page if we've
* already tried all of the ones that
* page_ctr_cands indicated. Stash off where we left
* off.
* Note: this is not exact since we don't hold the
* page_freelist_locks before we initially get the
* value of cands for performance reasons, but should
* be a decent approximation.
*/
if (--cands == 0) {
idx;
break;
}
next:
color_mask, &it);
wrap++;
#if defined(__sparc)
nhi = 0; /* search kcage ranges */
#endif
}
}
return (NULL);
}
/*
* For the given mnode, promote as many small pages to large pages as possible.
* mnode can be -1, which means do them all
*/
void
{
int r; /* region size */
if (mpss_coalesce_disable) {
return;
}
/*
* Lock the entire freelist and coalesce what we can.
*
* Always promote to the largest page possible
* first to reduce the number of page promotions.
*/
}
for (r = mmu_page_sizes - 1; r > 0; r--) {
if (cands != 0)
break;
}
if (cands == 0) {
continue;
}
full = FULL_REGION_CNT(r);
int tmnode = interleaved_mnodes ?
pfnum <
(void) page_promote(tmnode,
}
}
/* shared hpm_counters covers all mnodes, so we quit */
if (interleaved_mnodes)
break;
}
}
}
}
/*
* This is where all polices for moving pages around
* to different page size free lists is implemented.
* Returns 1 on success, 0 on failure.
*
* So far these are the priorities for this algorithm in descending
* order:
*
* 1) When servicing a request try to do so with a free page
* from next size up. Helps defer fragmentation as long
* as possible.
*
* 2) Page coalesce on demand. Only when a freelist
* larger than PAGESIZE is empty and step 1
* will not work since all larger size lists are
* also empty.
*
* If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
*/
page_t *
{
if (nszc == mmu_page_sizes)
return (NULL);
/*
* First try to break up a larger page to fill current size freelist.
*/
/*
* If page found then demote it.
*/
/*
* If pfnhi is not PFNNULL, look for large page below
* pfnhi. PFNNULL signifies no pfn requirement.
*/
do {
break;
}
}
if (pp) {
if (ret_pp) {
#if defined(__sparc)
if (PP_ISNORELOC(ret_pp)) {
}
#endif
return (ret_pp);
}
}
}
/* loop through next size bins */
/* we are done with this page size - check next */
/* we have already checked next size bins */
break;
if (bin_prev != INVALID_COLOR) {
break;
}
}
}
return (ret_pp);
}
/*
* Helper routine used only by the freelist code to lock
* a page. If the page is a large page then it succeeds in
* locking all the constituent pages or none at all.
 * Returns 1 on success, 0 on failure.
*/
static int
{
/*
* Fail if can't lock first or only page.
*/
return (0);
}
/*
* PAGESIZE: common case.
*/
return (1);
}
/*
* Large page case.
*/
/*
* On failure unlock what we have locked so far.
* We want to avoid attempting to capture these
* pages as the pcm mutex may be held which could
* lead to a recursive mutex panic.
*/
}
return (0);
}
}
return (1);
}
/*
* init context for walking page lists
 * Called when a page of the given szc is unavailable. Sets markers
 * for the beginning of the search to detect when the search has
 * completed a full cycle. Sets flags for splitting larger pages
 * and coalescing smaller pages. Page walking proceeds until a page
* of the desired equivalent color is found.
*/
void
{
/*
* if vac aliasing is possible make sure lower order color
* bits are never ignored
*/
if (vac_colors > 1)
ceq &= 0xf0;
/*
* calculate the number of non-equivalent colors and
* color equivalency mask
*/
if (flags & PG_MATCH_COLOR) {
if (cpu_page_colors < 0) {
/*
* this is a heterogeneous machine with different CPUs
*/
}
}
/* we can split pages in the freelist, but not the cachelist */
if (can_split) {
/* set next szc color masks and number of free list bins */
}
} else {
plw->plw_do_split = 0;
}
}
/*
* set mark to flag where next split should occur
*/
plw->plw_split_next = \
plw->plw_split_next = \
} \
}
{
if (plw->plw_do_split) {
plw->plw_do_split = 0;
}
if (szc == 0) {
}
/*
* large pages all have the same vac color
* so by now we should be done with next
* size page splitting process
*/
plw->plw_do_split = 0;
return (nbin);
}
} else {
/*
* check if next page size bin is the
* same as the next page size bin for
* bin0
*/
nbin);
}
return (nbin);
}
}
}
}
return (nbin);
}
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
int plw_initialized;
if (mtype < 0) { /* mnode does not have memory in mtype range */
return (NULL);
}
plw_initialized = 0;
/*
* Only hold one freelist lock at a time, that way we
* can start anywhere and not have to worry about lock
* ordering.
*/
do {
goto bin_empty_1;
goto bin_empty_0;
/*
* These were set before the page
* was put on the free list,
* they must still be set.
*/
/*
* Walk down the hash chain.
* 8k pages are linked on p_next
* and p_prev fields. Large pages
* are a contiguous group of
* constituent pages linked together
* on their p_next and p_prev fields.
* The large pages are linked together
* on the hash chain using p_vpnext
* p_vpprev of the base constituent
* page of each large page.
*/
if (szc == 0) {
} else {
}
goto bin_empty_0;
}
if (szc == 0) {
} else {
}
#if defined(__sparc)
(flags & PG_NORELOC) == 0);
if (PP_ISNORELOC(pp))
#endif
return (pp);
if (plw_initialized == 0) {
&plw);
plw_initialized = 1;
}
/* calculate the next bin with equivalent color */
/*
* color bins are all empty if color match. Try and
* satisfy the request by breaking up or coalescing
* pages from a different size freelist of the correct
* color that satisfies the ORIGINAL color requested.
* If that fails then try pages of the same size but
* different colors assuming we are not called with
* PG_MATCH_COLOR.
*/
if (plw.plw_do_split &&
return (pp);
return (pp);
}
/* if allowed, cycle through additional mtypes */
if (mtype >= 0)
goto try_again;
return (NULL);
}
/*
* Returns the count of free pages for 'pp' with size code 'szc'.
* Note: This function does not return an exact value as the page freelist
* locks are not held and thus the values in the page_counters may be
* changing as we walk through the data.
*/
static int
{
int i;
/* Make sure pagenum passed in is aligned properly */
/* Prevent page_counters dynamic memory from being freed */
/* Check for completely full region */
return (pgfree);
}
while (--r > 0) {
full = FULL_REGION_CNT(r);
/*
* If cnt here is full, that means we have already
* accounted for these pages earlier.
*/
}
}
}
return (pgfree);
}
/*
* Called from page_geti_contig_pages to exclusively lock constituent pages
* starting from 'spp' for page size code 'szc'.
*
* If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
* region needs to be greater than or equal to the threshold.
*/
static int
{
goto skipptcpcheck;
/*
* check if there are sufficient free pages available before attempting
* to trylock. Count is approximate as page counters can change.
*/
/* attempt to trylock if there are sufficient already free pages */
return (0);
}
for (i = 0; i < pgcnt; i++) {
while (--i != (pgcnt_t)-1) {
}
return (0);
}
ASSERT(i == 0);
return (0);
}
if (PP_ISNORELOC(pp)) {
while (i != (pgcnt_t)-1) {
i--;
}
return (0);
}
}
return (1);
}
/*
* Claim large page pointed to by 'pp'. 'pp' is the starting set
* of 'szc' constituent pages that had been locked exclusively previously.
* Will attempt to relocate constituent pages in use.
*/
static page_t *
{
while (pgcnt) {
/*
* If this is a PG_FREE_LIST page then its
* size code can change underneath us due to
 * page promotion or demotion. As an optimization
* use page_list_sub_pages() instead of
* page_list_sub().
*/
return (pp);
}
}
continue;
}
PP_SETAGED(pp);
pp++;
pgcnt--;
continue;
}
/*
* page_create_wait freemem accounting done by caller of
* page_get_freelist and not necessary to call it prior to
* calling page_get_replacement_page.
*
* page_get_replacement_page can call page_get_contig_pages
* to acquire a large page (szc > 0); the replacement must be
* smaller than the contig page size to avoid looping or
* szc == 0 and PGI_PGCPSZC0 is set.
*/
if (replpp) {
}
}
/*
* If replacement is NULL or do_page_relocate fails, fail
* coalescing of pages.
*/
/*
* Unlock un-processed target list
*/
while (pgcnt--) {
pp++;
}
/*
* Free the processed target list.
*/
while (pplist) {
}
return (NULL);
}
/* LINTED */
while (npgs--) {
(flags & PGI_PGCPSZC0)));
}
}
return (pplist);
}
/*
* Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
* of 0 means nothing left after trim.
*/
int
{
int decr;
int rc = 0;
/* lower part of this mseg inside kernel cage */
/* kernel cage may have transitioned past mseg */
rc = 1;
}
}
/* else entire mseg in the cage */
} else {
/* upper part of this mseg inside kernel cage */
/* kernel cage may have transitioned past mseg */
rc = 1;
}
} else {
/* entire mseg outside of kernel cage */
rc = 1;
}
}
return (rc);
}
/*
* called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
* page with size code 'szc'. Claiming such a page requires acquiring
* exclusive locks on all constituent pages (page_trylock_contig_pages),
* relocating pages in use and concatenating these constituent pages into a
* large page.
*
 * The page lists do not have such a large page and page_freelist_split has
 * failed to demote larger pages and/or page_freelist_coalesce has failed to
 * coalesce smaller pages into a large page.
 *
* 'flags' may specify PG_COLOR_MATCH which would limit the search of large
* pages with the same color as 'bin'.
*
* 'pfnflag' specifies the subset of the pfn range to search.
*/
static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
{
/* LINTED : set but not used in function */
return (NULL);
} else {
ceq_mask = 0;
}
/* clear "non-significant" color bits */
/*
* trim the pfn range to search based on pfnflag. pfnflag is set
* when there have been previous page_get_contig_page failures to
* limit the search.
*
* The high bit in pfnflag specifies the number of 'slots' in the
* pfn range and the remainder of pfnflag specifies which slot.
* For example, a value of 1010b would mean the second slot of
* the pfn range that has been divided into 8 slots.
*/
if (pfnflag > 1) {
int slotlen;
}
memsegs_lock(0);
/*
* loop through memsegs to look for contig page candidates
*/
/* no overlap */
continue;
}
/* mseg too small */
continue;
/* trim off kernel cage pages from pfn range */
if (kcage_on) {
continue;
} else {
}
/* round to szcpgcnt boundaries */
continue;
/*
* set lo to point to the pfn for the desired bin. Large
* page sizes may only have a single page color
*/
if (ceq_mask > 0 || interleaved_mnodes) {
/* set lo to point at appropriate color */
color_mask, &it);
}
/* mseg cannot satisfy color request */
continue;
}
/* randomly choose a point between lo and hi to begin search */
if (ceq_mask || interleaved_mnodes) {
}
}
do {
/* pages unlocked by page_claim on failure */
memsegs_unlock(0);
return (pp);
}
}
if (ceq_mask == 0 && !interleaved_mnodes) {
} else {
} else {
}
}
/* start from the beginning */
}
}
memsegs_unlock(0);
return (NULL);
}
/*
* controlling routine that searches through physical memory in an attempt to
 * claim a large page, based on the input parameters, that is not available
 * on the page free lists.
*
* calls page_geti_contig_pages with an initial pfn range from the mnode
* and mtype. page_geti_contig_pages will trim off the parts of the pfn range
* that overlaps with the kernel cage or does not match the requested page
* color if PG_MATCH_COLOR is set. Since this search is very expensive,
* page_geti_contig_pages may further limit the search range based on
* previous failure counts (pgcpfailcnt[]).
*
* for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
* pagesize page that satisfies mtype.
*/
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
/* no allocations from cage */
flags |= PGI_NOCAGE;
/* LINTED */
if (mtype < 0) { /* mnode does not have memory in mtype range */
return (NULL);
}
/* do not limit search and ignore color if hi pri */
/* remove color match to improve chances */
flags &= ~PG_MATCH_COLOR;
do {
/* get pfn range based on mnode and mtype */
if (pfnflag) {
/* double the search size */
}
return (pp);
}
} while (mtype >= 0);
return (NULL);
}
/*
* Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
*
* Does its own locking and accounting.
* If PG_MATCH_COLOR is set, then NULL will be returned if there are no
* pages of the proper color even if there are pages of a different color.
*
* Finds a page, removes it, THEN locks it.
*/
/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
int mnode;
int mtype;
/*
* If we aren't passed a specific lgroup, or passed a freed lgrp
* assume we wish to allocate near to the current thread's home.
*/
if (!LGRP_EXISTS(lgrp))
lgrp = lgrp_home_lgrp();
if (kcage_on) {
/*
* Set a "reserve" of kcage_throttlefree pages for
* PG_PANIC and cageout thread allocations.
*
* Everybody else has to serialize in
* page_create_get_something() to get a cage page, so
* that we don't deadlock cageout!
*/
return (NULL);
}
} else {
flags &= ~PG_NORELOC;
flags |= PGI_NOCAGE;
}
/* LINTED */
/*
* Convert size to page size code.
*/
panic("page_get_freelist: illegal page size request");
/* LINTED */
/*
* Try to get a local page first, but try remote if we can't
* get a page of the right color.
*/
int, mnode,
return (pp);
}
}
/*
* for non-SZC0 PAGESIZE requests, check cachelist before checking
* remote free lists. Caller expected to call page_get_cachelist which
* will check local cache lists and remote free lists.
*/
return (NULL);
}
/*
* Try to get a non-local freelist page.
*/
int, mnode,
return (pp);
}
}
}
/*
* when the cage is off chances are page_get_contig_pages() will fail
 * to lock a large page chunk, therefore when the cage is off it's not
 * called by default; this can be changed via /etc/system.
 *
 * page_get_contig_pages() is also called to acquire a base pagesize page
* for page_create_get_something().
*/
(page_get_func != page_get_contig_pages)) {
goto pgretry;
}
return (NULL);
}
/*
* Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
*
* Does its own locking.
* If PG_MATCH_COLOR is set, then NULL will be returned if there are no
* pages of the proper color even if there are pages of a different color.
* Otherwise, scan the bins for ones with pages. For each bin with pages,
* try to lock one of them. If no page can be locked, try the
* next bin. Return NULL if a page can not be found and locked.
*
 * Finds a page, tries to lock it, then removes it.
*/
/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
/*LINTED*/
int mnode;
int mtype;
/*
 * If we aren't passed a specific lgroup, or passed a freed lgrp
* assume we wish to allocate near to the current thread's home.
*/
if (!LGRP_EXISTS(lgrp))
lgrp = lgrp_home_lgrp();
if (!kcage_on) {
flags &= ~PG_NORELOC;
flags |= PGI_NOCAGE;
}
/*
* Reserve kcage_throttlefree pages for critical kernel
* threads.
*
* Everybody else has to go to page_create_get_something()
* to get a cage page, so we don't deadlock cageout.
*/
return (NULL);
}
/* LINTED */
/* LINTED */
/*
* Try local cachelists first
*/
int, mnode,
return (pp);
}
}
/*
* Try freelists/cachelists that are farther away
* This is our only chance to allocate remote pages for PAGESIZE
* requests.
*/
0, flags);
int, mnode,
return (pp);
}
int, mnode,
return (pp);
}
}
return (NULL);
}
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
int plw_initialized;
/* LINTED */
if (mtype < 0) { /* mnode does not have memory in mtype range */
return (NULL);
}
plw_initialized = 0;
/*
* Only hold one cachelist lock at a time, that way we
* can start anywhere and not have to worry about lock
* ordering.
*/
do {
goto bin_empty_1;
goto bin_empty_0;
/*
* We have searched the complete list!
* And all of them (might only be one)
* are locked. This can happen since
* these pages can also be found via
* the hash list. When found via the
* hash list, they are locked first,
* then removed. We give up to let the
* other thread run.
*/
break;
}
mnode);
}
if (pp) {
/*
* Found and locked a page.
* Pull it off the list.
*/
/*
* Subtract counters before releasing pcm mutex
* to avoid a race with page_freelist_coalesce
* and page_freelist_split.
*/
#if defined(__sparc)
(flags & PG_NORELOC) == 0 ||
PP_ISNORELOC(pp));
if (PP_ISNORELOC(pp)) {
}
#endif
return (pp);
}
if (plw_initialized == 0) {
plw_initialized = 1;
}
/* calculate the next bin with equivalent color */
}
if (mtype >= 0)
goto try_again;
return (NULL);
}
#ifdef DEBUG
#define REPL_PAGE_STATS
#endif /* DEBUG */
#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;		/* counter names are assumed */
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;

#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */
int pgrppgcp;
/*
* The freemem accounting must be done by the caller.
* First we try to get a replacement page of the same size as like_pp,
* if that is not possible, then we just get a set of discontiguous
* PAGESIZE pages.
*/
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
int mnode, page_mnode;
int szc;
int mtype;
int flags = 0;
/*
* Now we reset like_pp to the base page_t.
* That way, we won't walk past the end of this 'szc' page.
*/
if (PP_ISNORELOC(like_pp)) {
} else if (pgrflags & PGR_NORELOC) {
flags = PG_NORELOC;
}
/*
* Kernel pages must always be replaced with the same size
* pages, since we cannot properly handle demotion of kernel
* pages.
*/
pgrflags |= PGR_SAMESZC;
/* LINTED */
while (npgs) {
for (;;) {
/*
* If an lgroup was specified, try to get the
* page from that lgroup.
* NOTE: Must be careful with code below because
* lgroup may disappear and reappear since there
* is no locking for lgroup here.
*/
if (LGRP_EXISTS(lgrp_target)) {
/*
* Keep local variable for lgroup separate
* from lgroup argument since this code should
* only be exercised when lgroup argument
* exists....
*/
lgrp = lgrp_target;
/* Try the lgroup's freelists first */
!= -1) {
pplist =
}
/*
 * Now try its cachelists if this is a
* small page. Don't need to do it for
* larger ones since page_freelist_coalesce()
* already failed.
*/
break;
/* Now try its cachelists */
!= -1) {
pplist =
}
break;
}
/* Done looking in this lgroup. Bail out. */
break;
}
/*
* No lgroup was specified (or lgroup was removed by
 * DR), so just try to get the page as close to
* like_pp's mnode as possible.
* First try the local freelist...
*/
break;
/*
* ...then the local cachelist. Don't need to do it for
* larger pages cause page_freelist_coalesce() already
* failed there anyway.
*/
if (szc == 0) {
break;
}
}
/* Now try remote freelists */
page_mnode = mnode;
lgrp =
!= -1) {
/*
* Skip local mnode.
*/
if ((mnode == page_mnode) ||
continue;
}
break;
/* Now try remote cachelists */
if (mnode == -1)
break;
/*
* Skip local mnode.
*/
if ((mnode == page_mnode) ||
continue;
break;
}
}
/*
* Break out of while loop under the following cases:
* - If we successfully got a page.
* - If pgrflags specified only returning a specific
* page size and we could not find that page size.
* - If we could not satisfy the request with PAGESIZE
* or larger pages.
*/
break;
/* try to find contig page */
(mnode =
!= -1) {
flags | PGI_PGCPHIPRI);
}
break;
}
/*
* The correct thing to do here is try the next
* page size down using szc--. Due to a bug
* with the processing of HAT_RELOAD_SHARE
* where the sfmmu_ttecnt arrays of all
* hats sharing an ISM segment don't get updated,
* using intermediate size pages for relocation
* can lead to continuous page faults.
*/
szc = 0;
}
int, mnode,
PP_CLRFREE(pp);
PP_CLRAGED(pp);
npgs--;
}
} else {
break;
}
}
if (npgs) {
/*
* We were unable to allocate the necessary number
* of pages.
* We need to free up any pl.
*/
return (NULL);
} else {
return (pl);
}
}
/*
 * demote a free large page to its constituent pages
*/
void
{
int mnode;
}
}
/*
* Factor in colorequiv to check additional 'equivalent' bins.
*/
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
while ((colors >> a) == 0)
a--;
if ((a << 4) > colorequivszc[i]) {
colorequivszc[i] = (a << 4);
}
}
}
}