vm_seg.c revision 0209230bf1261579beab4f55226bb509e6b850cb
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* VM - segment management.
*/
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
/*
* kstats for segment advise
*/
segadvstat_t segadvstat = {
{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
/* #define PDEBUG */
#if defined(PDEBUG) || defined(lint) || defined(__lint)
int pdebug = 0;
#else
#define pdebug 0
#endif /* PDEBUG */
#define PPRINTF if (pdebug) printf
#define PPRINT(x) PPRINTF(x)
#define PPRINT1(x, a) PPRINTF(x, a)
#define PPRINT2(x, a, b) PPRINTF(x, a, b)
#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c)
#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d)
#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e)
#define P_HASHMASK (p_hashsize - 1)
#define P_BASESHIFT 6
/*
* entry in the segment page cache
*/
struct seg_pcache {
struct seg_pcache *p_hnext; /* list for hashed blocks */
struct seg_pcache *p_hprev;
int p_active; /* active count */
int p_ref; /* ref bit */
size_t p_len; /* segment length */
caddr_t p_addr; /* base address */
struct seg *p_seg; /* segment */
struct page **p_pp; /* pp shadow list */
enum seg_rw p_rw; /* rw */
uint_t p_flags; /* bit flags */
int (*p_callback)(struct seg *, caddr_t, size_t,
struct page **, enum seg_rw);
};
struct seg_phash {
struct seg_pcache *p_hnext; /* list for hashed blocks */
struct seg_pcache *p_hprev;
int p_qlen; /* Q length */
kmutex_t p_hmutex; /* protects hash bucket */
};
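/*
 * Each hash bucket heads a circular, doubly linked list of seg_pcache
 * entries. The bucket itself acts as the list sentinel: because p_hnext
 * and p_hprev are the first members of both seg_phash and seg_pcache,
 * the bucket header can be cast to (struct seg_pcache *), and traversal
 * terminates when the walk wraps back around to the bucket header.
 */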
static int seg_preap_time = 20; /* reclaim every 20 secs */
static int seg_pmaxqlen = 5; /* max Q length in hash list */
static int seg_ppcount = 5; /* max # of purges per reclaim interval */
static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */
static pgcnt_t seg_pwindow; /* max # of pages that can be cached */
static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */
static pgcnt_t seg_plocked_window; /* # pages from window */
int seg_preapahead;
static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */
static int seg_pupdate_active = 1; /* background reclaim thread */
static clock_t seg_preap_interval; /* reap interval in ticks */
static kmutex_t seg_pcache; /* protects the whole pagelock cache */
static kmutex_t seg_pmem; /* protects window counter */
static ksema_t seg_psaync_sem; /* sema for reclaim thread */
static struct seg_phash *p_hashtab;
static int p_hashsize = 0;
#define p_hash(seg) \
(P_HASHMASK & \
((uintptr_t)(seg) >> P_BASESHIFT))
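/*
 * p_hash() indexes the cache by segment pointer alone: the pointer is
 * shifted right by P_BASESHIFT (6, i.e. a 64-byte grain) and masked with
 * P_HASHMASK. As an illustrative example, with p_hashsize of 1024 a
 * segment at address 0x3000c0 hashes to ((0x3000c0 >> 6) & 0x3ff) == 3.
 * Because the hash depends only on the seg pointer, every cached range
 * of a given segment lands in the same bucket.
 */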
#define p_match(pcp, seg, addr, len, rw) \
(((pcp)->p_seg == (seg) && \
(pcp)->p_addr == (addr) && \
(pcp)->p_rw == (rw) && \
(pcp)->p_len == (len)) ? 1 : 0)
#define p_match_pp(pcp, seg, addr, len, pp, rw) \
(((pcp)->p_seg == (seg) && \
(pcp)->p_addr == (addr) && \
(pcp)->p_pp == (pp) && \
(pcp)->p_rw == (rw) && \
(pcp)->p_len == (len)) ? 1 : 0)
/*
 * look up an address range in the pagelock cache. On a hit, return the
 * shadow list and bump the active count.
*/
struct page **
seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
/*
 * Skip the pagelock cache while DR is in progress or
 * seg_pcache is off.
*/
if (seg_pdisable || seg_plazy == 0) {
return (NULL);
}
hp = &p_hashtab[p_hash(seg)];
mutex_enter(&hp->p_hmutex);
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
if (p_match(pcp, seg, addr, len, rw)) {
pcp->p_active++;
mutex_exit(&hp->p_hmutex);
PPRINT5("seg_plookup hit: seg %p, addr %p, "
"len %lx, count %d, pplist %p \n",
(void *)seg, (void *)addr, len, pcp->p_active,
(void *)pcp->p_pp);
return (pcp->p_pp);
}
}
mutex_exit(&hp->p_hmutex);
PPRINT("seg_plookup miss:\n");
return (NULL);
}
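/*
 * A simplified sketch of how a segment driver's pagelock routine is
 * expected to use this interface (illustrative only; building and
 * locking the shadow list, and the reclaim_callback, are driver specific):
 *
 *	if ((pplist = seg_plookup(seg, addr, len, rw)) != NULL)
 *		return (pplist);	<- cache hit, active count bumped
 *	if (seg_pinsert_check(seg, len, flags) == SEGP_SUCCESS) {
 *		... build and lock the shadow list into pplist ...
 *		(void) seg_pinsert(seg, addr, len, pplist, rw, flags,
 *		    reclaim_callback);
 *	}
 *	... later, when the caller is done with the pages ...
 *	seg_pinactive(seg, addr, len, pplist, rw, reclaim_callback);
 */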
/*
 * mark an address range inactive. If the cache is off or the address
 * range is not in the cache, we call the segment driver to reclaim
 * the pages. Otherwise just decrement the active count and set the ref bit.
*/
void
seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
struct page **, enum seg_rw))
{
struct seg_pcache *pcp;
struct seg_phash *hp;
if (seg_plazy == 0) {
(void) (*callback)(seg, addr, len, pp, rw);
return;
}
hp = &p_hashtab[p_hash(seg)];
mutex_enter(&hp->p_hmutex);
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
pcp->p_active--;
ASSERT(pcp->p_active >= 0);
if (pcp->p_active == 0 && seg_pdisable) {
int npages;
ASSERT(callback == pcp->p_callback);
/* free the entry */
hp->p_qlen--;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
mutex_exit(&hp->p_hmutex);
npages = pcp->p_len >> PAGESHIFT;
mutex_enter(&seg_pmem);
seg_plocked -= npages;
if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
seg_plocked_window -= npages;
}
mutex_exit(&seg_pmem);
kmem_free(pcp, sizeof (struct seg_pcache));
goto out;
}
pcp->p_ref = 1;
mutex_exit(&hp->p_hmutex);
return;
}
}
mutex_exit(&hp->p_hmutex);
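	/*
	 * No matching cache entry was found; fall through to 'out' and let
	 * the segment driver's callback reclaim the pages. The 'out' label
	 * is also reached via the goto above when a forced free (caching
	 * disabled) removed the entry.
	 */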
out:
(void) (*callback)(seg, addr, len, pp, rw);
}
/*
 * seg_pinsert_check() is used by segment drivers to predict whether
* a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
*/
int
seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
{
struct seg_phash *hp;
if (seg_plazy == 0) {
return (SEGP_FAIL);
}
if (seg_pdisable != 0) {
return (SEGP_FAIL);
}
ASSERT((len & PAGEOFFSET) == 0);
hp = &p_hashtab[p_hash(seg)];
if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
return (SEGP_FAIL);
}
/*
* If the SEGP_FORCE_WIRED flag is set,
* we skip the check for seg_pwindow.
*/
if ((flags & SEGP_FORCE_WIRED) == 0) {
pgcnt_t npages;
npages = len >> PAGESHIFT;
if ((seg_plocked_window + npages) > seg_pwindow) {
return (SEGP_FAIL);
}
}
return (SEGP_SUCCESS);
}
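/*
 * Note that seg_pinsert_check() reads p_qlen and seg_plocked_window
 * without holding any locks, so its answer is only advisory: a
 * subsequent seg_pinsert() repeats the same checks (the window check
 * under seg_pmem) and may still fail.
 */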
/*
 * insert an address range with its shadow list into the pagelock cache.
 * If the cache is off, caching is temporarily disabled, the hash bucket
 * queue is too long, or the allowed 'window' would be exceeded, return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
*/
int
seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
size_t, struct page **, enum seg_rw))
{
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages;
if (seg_plazy == 0) {
return (SEGP_FAIL);
}
if (seg_pdisable != 0) {
return (SEGP_FAIL);
}
ASSERT((len & PAGEOFFSET) == 0);
hp = &p_hashtab[p_hash(seg)];
if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
return (SEGP_FAIL);
}
npages = len >> PAGESHIFT;
mutex_enter(&seg_pmem);
/*
* If the SEGP_FORCE_WIRED flag is set,
* we skip the check for seg_pwindow.
*/
if ((flags & SEGP_FORCE_WIRED) == 0) {
seg_plocked_window += npages;
if (seg_plocked_window > seg_pwindow) {
seg_plocked_window -= npages;
mutex_exit(&seg_pmem);
return (SEGP_FAIL);
}
}
seg_plocked += npages;
mutex_exit(&seg_pmem);
pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
pcp->p_seg = seg;
pcp->p_addr = addr;
pcp->p_len = len;
pcp->p_pp = pp;
pcp->p_rw = rw;
pcp->p_callback = callback;
pcp->p_active = 1;
pcp->p_flags = flags;
PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
(void *)seg, (void *)addr, len, (void *)pp);
hp = &p_hashtab[p_hash(seg)];
mutex_enter(&hp->p_hmutex);
hp->p_qlen++;
pcp->p_hnext = hp->p_hnext;
pcp->p_hprev = (struct seg_pcache *)hp;
hp->p_hnext->p_hprev = pcp;
hp->p_hnext = pcp;
mutex_exit(&hp->p_hmutex);
return (SEGP_SUCCESS);
}
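/*
 * The new entry is created with p_active == 1, so it cannot be reclaimed
 * by the async thread or a purge until the caller releases it with
 * seg_pinactive(). It is linked at the head of its hash bucket's list.
 */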
/*
 * purge all entries from the pagelock cache that are not active
* and not recently used. Drop all locks and call through
* the address space into the segment driver to reclaim
* the pages. This makes sure we get the address space
* and segment driver locking right.
*/
static void
seg_ppurge_all(int force)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
int purge_count = 0;
pgcnt_t npages = 0;
pgcnt_t npages_window = 0;
/*
 * if the cache is off or empty, return
*/
if (seg_plazy == 0 || seg_plocked == 0) {
return;
}
for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
/*
* While 'force' is set, seg_pasync_thread is not
 * throttled. This is to speed up flushing of seg_pcache
 * in preparation for DR.
 *
 * In the normal case, when 'force' is not set, we throttle
 * seg_pasync_thread so that we don't spend all the
 * time purging the cache.
*/
while ((pcp != (struct seg_pcache *)hp) &&
(force || (purge_count <= seg_ppcount))) {
/*
* purge entries which are not active and
* have not been used recently and
* have the SEGP_ASYNC_FLUSH flag.
*
* In the 'force' case, we ignore the
* SEGP_ASYNC_FLUSH flag.
*/
if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
pcp->p_ref = 1;
if (force)
pcp->p_ref = 0;
if (!pcp->p_ref && !pcp->p_active) {
struct as *as = pcp->p_seg->s_as;
/*
* try to get the readers lock on the address
* space before taking out the cache element.
* This ensures as_pagereclaim() can actually
* call through the address space and free
* the pages. If we don't get the lock, just
* skip this entry. The pages will be reclaimed
* by the segment driver at unmap time.
*/
if (AS_LOCK_TRYENTER(as, &as->a_lock,
RW_READER)) {
hp->p_qlen--;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
purge_count++;
}
} else {
pcp->p_ref = 0;
}
pcp = pcp->p_hnext;
}
mutex_exit(&hp->p_hmutex);
if (!force && purge_count > seg_ppcount)
break;
}
/*
* run the delayed callback list. We don't want to hold the
* cache lock during a call through the address space.
*/
while (delcallb_list != NULL) {
struct as *as;
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
as = pcp->p_seg->s_as;
PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
"pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
pcp->p_len, (void *)pcp->p_pp);
as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
pcp->p_len, pcp->p_rw);
AS_LOCK_EXIT(as, &as->a_lock);
npages += pcp->p_len >> PAGESHIFT;
if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
npages_window += pcp->p_len >> PAGESHIFT;
}
kmem_free(pcp, sizeof (struct seg_pcache));
}
mutex_enter(&seg_pmem);
seg_plocked -= npages;
seg_plocked_window -= npages_window;
mutex_exit(&seg_pmem);
}
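/*
 * Note the two-phase structure above: candidate entries are unlinked from
 * their buckets while holding the bucket mutex (reusing p_hprev to chain
 * them onto delcallb_list), and only after the bucket locks are dropped
 * do we call as_pagereclaim() through the address space. seg_ppurge()
 * below uses the same pattern with the driver callback.
 */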
/*
 * Remove cached pages for a segment's (or segments') entries from the
 * hashtable. The segments are identified by a given client's callback
 * function.
 * This is useful when multiple segs are cached on behalf of a
 * dummy segment (ISM/DISM) with a common callback function.
 * The client's callback function may return status indicating
 * that the last seg's entry has been purged. In such a case
 * seg_ppurge_seg() stops searching the hashtable and exits.
 * Otherwise all hashtable entries are scanned.
*/
void
seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
struct page **, enum seg_rw))
{
struct seg_pcache *pcp, *npcp;
struct seg_phash *hp;
pgcnt_t npages = 0;
pgcnt_t npages_window = 0;
int done = 0;
/*
 * if the cache is off or empty, return
*/
if (seg_plazy == 0 || seg_plocked == 0) {
return;
}
mutex_enter(&seg_pcache);
seg_pdisable++;
mutex_exit(&seg_pcache);
for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
while (pcp != (struct seg_pcache *)hp) {
/*
* purge entries which are not active
*/
npcp = pcp->p_hnext;
if (!pcp->p_active && pcp->p_callback == callback) {
hp->p_qlen--;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
pcp->p_len, pcp->p_pp, pcp->p_rw)) {
done = 1;
}
npages += pcp->p_len >> PAGESHIFT;
if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
npages_window +=
pcp->p_len >> PAGESHIFT;
}
kmem_free(pcp, sizeof (struct seg_pcache));
}
pcp = npcp;
if (done)
break;
}
mutex_exit(&hp->p_hmutex);
if (done)
break;
}
mutex_enter(&seg_pcache);
seg_pdisable--;
mutex_exit(&seg_pcache);
mutex_enter(&seg_pmem);
seg_plocked -= npages;
seg_plocked_window -= npages_window;
mutex_exit(&seg_pmem);
}
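/*
 * seg_pdisable is raised for the duration of the scan above so that
 * seg_plookup() and seg_pinsert() stay out of the cache while entries
 * belonging to the given callback are being torn down.
 */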
/*
 * purge all entries for a given segment. Since we
 * call back into the segment driver directly for page
 * reclaim, the caller needs to hold the right locks.
*/
void
seg_ppurge(struct seg *seg)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages = 0;
pgcnt_t npages_window = 0;
if (seg_plazy == 0) {
return;
}
hp = &p_hashtab[p_hash(seg)];
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
while (pcp != (struct seg_pcache *)hp) {
if (pcp->p_seg == seg) {
if (pcp->p_active) {
break;
}
hp->p_qlen--;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
}
pcp = pcp->p_hnext;
}
mutex_exit(&hp->p_hmutex);
while (delcallb_list != NULL) {
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
"pplist %p\n", (void *)seg, (void *)pcp->p_addr,
pcp->p_len, (void *)pcp->p_pp);
ASSERT(seg == pcp->p_seg);
(void) (*pcp->p_callback)(seg, pcp->p_addr,
pcp->p_len, pcp->p_pp, pcp->p_rw);
npages += pcp->p_len >> PAGESHIFT;
if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
npages_window += pcp->p_len >> PAGESHIFT;
}
kmem_free(pcp, sizeof (struct seg_pcache));
}
mutex_enter(&seg_pmem);
seg_plocked -= npages;
seg_plocked_window -= npages_window;
mutex_exit(&seg_pmem);
}
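/*
 * Because p_hash() depends only on the segment pointer, all cached
 * entries for a segment live in a single bucket, which is why
 * seg_ppurge() scans just one hash chain. The scan also stops early if
 * it finds an entry for this segment that is still active.
 */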
static void seg_pinit_mem_config(void);
/*
 * set up the pagelock cache
*/
static void
seg_pinit(void)
{
struct seg_phash *hp;
int i;
uint_t physmegs;
sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL);
mutex_enter(&seg_pcache);
if (p_hashtab == NULL) {
physmegs = physmem >> (20 - PAGESHIFT);
/* If p_hashsize was not set in /etc/system ... */
if (p_hashsize == 0) {
/*
* Choose p_hashsize based on physmem.
*/
if (physmegs < 64) {
p_hashsize = 64;
} else if (physmegs < 1024) {
p_hashsize = 1024;
} else if (physmegs < 10 * 1024) {
p_hashsize = 8192;
} else if (physmegs < 20 * 1024) {
p_hashsize = 2 * 8192;
seg_pmaxqlen = 16;
} else {
p_hashsize = 128 * 1024;
seg_pmaxqlen = 128;
}
}
p_hashtab = kmem_zalloc(
p_hashsize * sizeof (struct seg_phash), KM_SLEEP);
for (i = 0; i < p_hashsize; i++) {
hp = (struct seg_phash *)&p_hashtab[i];
hp->p_hnext = (struct seg_pcache *)hp;
hp->p_hprev = (struct seg_pcache *)hp;
mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
}
if (seg_pwindow == 0) {
if (physmegs < 24) {
/* don't use cache */
seg_plazy = 0;
} else if (physmegs < 64) {
seg_pwindow = physmem >> 5; /* 3% of memory */
} else if (physmegs < 10 * 1024) {
seg_pwindow = physmem >> 3; /* 12% of memory */
} else {
seg_pwindow = physmem >> 1;
}
}
}
mutex_exit(&seg_pcache);
seg_pinit_mem_config();
}
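/*
 * An illustrative sizing example: on a machine with 8 GB of physical
 * memory, physmegs is 8192, so the table above selects a p_hashsize of
 * 8192 buckets and, if seg_pwindow was not already set, a window of
 * physmem >> 3, i.e. roughly 12% of memory may be cached by pagelock.
 */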
/*
* called by pageout if memory is low
*/
void
seg_preap(void)
{
/*
 * if the cache is off or empty, return
*/
if (seg_plocked == 0 || seg_plazy == 0) {
return;
}
sema_v(&seg_psaync_sem);
}
static void seg_pupdate(void *);
/*
 * run as a background thread and reclaim pagelock
* pages which have not been used recently
*/
void
seg_pasync_thread(void)
{
callb_cpr_t cpr_info;
kmutex_t pasync_lock; /* just for CPR stuff */
mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);
CALLB_CPR_INIT(&cpr_info, &pasync_lock,
callb_generic_cpr, "seg_pasync");
if (seg_preap_interval == 0) {
seg_preap_interval = seg_preap_time * hz;
} else {
seg_preap_interval *= hz;
}
if (seg_plazy && seg_pupdate_active) {
(void) timeout(seg_pupdate, NULL, seg_preap_interval);
}
for (;;) {
mutex_enter(&pasync_lock);
CALLB_CPR_SAFE_BEGIN(&cpr_info);
mutex_exit(&pasync_lock);
sema_p(&seg_psaync_sem);
mutex_enter(&pasync_lock);
CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
mutex_exit(&pasync_lock);
seg_ppurge_all(0);
}
}
static void
seg_pupdate(void *dummy)
{
sema_v(&seg_psaync_sem);
if (seg_plazy && seg_pupdate_active) {
(void) timeout(seg_pupdate, dummy, seg_preap_interval);
}
}
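/*
 * The reclaim pipeline: seg_pupdate() runs from a timeout every
 * seg_preap_interval ticks and posts seg_psaync_sem; seg_preap() posts
 * the same semaphore when pageout finds memory low. Either way,
 * seg_pasync_thread() wakes up and runs seg_ppurge_all(0), the
 * throttled (non-forced) purge.
 */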
static struct kmem_cache *seg_cache;
/*
* Initialize segment management data structures.
*/
void
seg_init(void)
{
kstat_t *ksp;
seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
0, NULL, NULL, NULL, NULL, NULL, 0);
ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *)segadvstat_ptr;
kstat_install(ksp);
}
seg_pinit();
}
/*
 * Allocate a segment to cover [base, base + size) (rounded out to
 * page boundaries) and attach it to the specified address space.
*/
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
struct seg *new;
caddr_t segbase;
size_t segsize;
segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
(uintptr_t)segbase;
if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
return ((struct seg *)NULL); /* bad virtual addr range */
if (as != &kas &&
valid_usr_range(segbase, segsize, 0, as,
as->a_userlimit) != RANGE_OKAY)
return ((struct seg *)NULL); /* bad virtual addr range */
new = kmem_cache_alloc(seg_cache, KM_SLEEP);
new->s_ops = NULL;
new->s_data = NULL;
new->s_szc = 0;
new->s_flags = 0;
if (seg_attach(as, segbase, segsize, new) < 0) {
kmem_cache_free(seg_cache, new);
return ((struct seg *)NULL);
}
/* caller must fill in ops, data */
return (new);
}
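/*
 * An illustrative example of the rounding above, assuming 8K pages
 * (PAGESIZE 0x2000): a request with base 0x10010 and size 0x100 yields
 * segbase 0x10000 and segsize 0x2000, i.e. the smallest page-aligned
 * range covering [base, base + size).
 */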
/*
* Attach a segment to the address space. Used by seg_alloc()
* and for kernel startup to attach to static segments.
*/
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
seg->s_as = as;
seg->s_base = base;
seg->s_size = size;
/*
 * as_addseg() will add the segment at the appropriate point
* in the list. It will return -1 if there is overlap with
* an already existing segment.
*/
return (as_addseg(as, seg));
}
/*
* Unmap a segment and free it from its associated address space.
* This should be called by anybody who's finished with a whole segment's
 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
 * responsibility of the segment driver to unlink the segment
* from the address space, and to free public and private data structures
* associated with the segment. (This is typically done by a call to
* seg_free()).
*/
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
int ret;
#endif /* DEBUG */
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
/* Shouldn't have called seg_unmap if mapping isn't yet established */
ASSERT(seg->s_data != NULL);
/* Unmap the whole mapping */
#ifdef DEBUG
ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
ASSERT(ret == 0);
#else
SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}
/*
* Free the segment from its associated as. This should only be called
* if a mapping to the segment has not yet been established (e.g., if
* an error occurs in the middle of doing an as_map when the segment
* has already been partially set up) or if it has already been deleted
* (e.g., from a segment driver unmap routine if the unmap applies to the
* entire segment). If the mapping is currently set up then seg_unmap() should
* be called instead.
*/
void
seg_free(struct seg *seg)
{
register struct as *as = seg->s_as;
struct seg *tseg = as_removeseg(as, seg);
ASSERT(tseg == seg);
/*
* If the segment private data field is NULL,
* then segment driver is not attached yet.
*/
if (seg->s_data != NULL)
SEGOP_FREE(seg);
kmem_cache_free(seg_cache, seg);
}
/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
void *arg,
pgcnt_t delta_pages)
{
/* Nothing to do. */
}
/*
* Attempt to purge seg_pcache. May need to return before this has
* completed to allow other pre_del callbacks to unlock pages. This is
* ok because:
* 1) The seg_pdisable flag has been set so at least we won't
 * cache any more locks and the locks we couldn't purge
* will not be held if they do get released by a subsequent
* pre-delete callback.
*
* 2) The rest of the memory delete thread processing does not
* depend on the changes made in this pre-delete callback. No
* panics will result, the worst that will happen is that the
* DR code will timeout and cancel the delete.
*/
/*ARGSUSED*/
static int
seg_p_mem_config_pre_del(
void *arg,
pgcnt_t delta_pages)
{
pgcnt_t old_plocked;
int stall_count = 0;
mutex_enter(&seg_pcache);
seg_pdisable++;
ASSERT(seg_pdisable != 0);
mutex_exit(&seg_pcache);
/*
* Attempt to empty the cache. Terminate if seg_plocked does not
* diminish with SEGP_STALL_THRESHOLD consecutive attempts.
*/
while (seg_plocked != 0) {
old_plocked = seg_plocked;
seg_ppurge_all(1);
if (seg_plocked == old_plocked) {
if (stall_count++ > SEGP_STALL_THRESHOLD) {
cmn_err(CE_NOTE, "!Pre-delete couldn't purge"
" pagelock cache - continuing");
break;
}
} else
stall_count = 0;
if (seg_plocked != 0)
delay(hz/SEGP_PREDEL_DELAY_FACTOR);
}
return (0);
}
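/*
 * The loop above keeps forcing full purges, delaying
 * hz/SEGP_PREDEL_DELAY_FACTOR ticks between attempts, and gives up once
 * SEGP_STALL_THRESHOLD consecutive attempts make no progress on
 * seg_plocked. seg_pdisable stays raised until the post-delete (or
 * cancel) callback drops it again.
 */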
/*ARGSUSED*/
static void
seg_p_mem_config_post_del(
void *arg,
pgcnt_t delta_pages,
int cancelled)
{
mutex_enter(&seg_pcache);
ASSERT(seg_pdisable != 0);
seg_pdisable--;
mutex_exit(&seg_pcache);
}
static kphysm_setup_vector_t seg_p_mem_config_vec = {
KPHYSM_SETUP_VECTOR_VERSION,
seg_p_mem_config_post_add,
seg_p_mem_config_pre_del,
seg_p_mem_config_post_del,
};
static void
seg_pinit_mem_config(void)
{
int ret;
ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
/*
* Want to catch this in the debug kernel. At run time, if the
 * callbacks don't get run, all will be OK, as the disable just makes
* it more likely that the pages can be collected.
*/
ASSERT(ret == 0);
}
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;
/*
* Verify that segment is not a shared anonymous segment which reserves
 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
* from one zone to another if any segments are shared. This is because the
* last process to exit will credit the swap reservation. This could lead
* to the swap being reserved by one zone, and credited to another.
*/
boolean_t
seg_can_change_zones(struct seg *seg)
{
struct segvn_data *svd;
if (seg->s_ops == &segspt_shmops)
return (B_FALSE);
if (seg->s_ops == &segvn_ops) {
svd = (struct segvn_data *)seg->s_data;
if (svd->type == MAP_SHARED &&
svd->amp != NULL &&
svd->amp->swresv > 0)
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Return swap reserved by a segment backing a private mapping.
*/
size_t
seg_swresv(struct seg *seg)
{
struct segvn_data *svd;
size_t swap = 0;
if (seg->s_ops == &segvn_ops) {
svd = (struct segvn_data *)seg->s_data;
if (svd->type == MAP_PRIVATE && svd->swresv > 0)
swap = svd->swresv;
}
return (swap);
}