seg_spt.c revision 1bd5c35fd400f7f19eee9efd795c32cedb602b06
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/tnf_probe.h>
/*
* # pages used for spt
*/
/*
* segspt_minfree is the memory left for the system after ISM
* locked its pages; it is set to 5% of availrmem in
* sptcreate when ISM is created. ISM should not use more
* than ~90% of availrmem; if it does, then the performance
* of the system may decrease. Machines with large memories may
* be able to use more memory for ISM, so we set the default
* segspt_minfree to 5% (which gives ISM at most 95% of availrmem).
* If somebody wants even more memory for ISM (risking hanging
* the system) they can patch segspt_minfree to a smaller number.
*/
pgcnt_t segspt_minfree = 0;
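/*
* Illustrative sketch, not part of the original source: sptcreate() is
* described above as lazily setting segspt_minfree to 5% of availrmem,
* which is simply availrmem / 20.
*/
#if 0	/* sketch only; availrmem is the kernel's free page counter */
	if (segspt_minfree == 0)
		segspt_minfree = availrmem / 20;	/* reserve ~5% */
#endif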
static void
segspt_badop()
{
panic("segspt_badop called");
/*NOTREACHED*/
}
#define SEGSPT_BADOP(t) (t(*)())segspt_badop
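/*
* For reference: SEGSPT_BADOP(int) expands to "(int (*)())segspt_badop",
* i.e. the panicking stub above cast to the required function pointer
* type. Every segspt_ops entry below except getpolicy is such a stub,
* since the dummy spt segment is never driven through these entry points
* directly.
*/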
struct seg_ops segspt_ops = {
SEGSPT_BADOP(int), /* dup */
SEGSPT_BADOP(int), /* fault */
SEGSPT_BADOP(int), /* setprot */
SEGSPT_BADOP(int), /* checkprot */
SEGSPT_BADOP(int), /* kluster */
SEGSPT_BADOP(int), /* sync */
SEGSPT_BADOP(int), /* lockop */
SEGSPT_BADOP(int), /* getprot */
SEGSPT_BADOP(int), /* gettype */
SEGSPT_BADOP(int), /* getvp */
SEGSPT_BADOP(int), /* advise */
SEGSPT_BADOP(void), /* dump */
SEGSPT_BADOP(int), /* pagelock */
SEGSPT_BADOP(int), /* setpgsz */
SEGSPT_BADOP(int), /* getmemid */
segspt_getpolicy, /* getpolicy */
SEGSPT_BADOP(int), /* capable */
};
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
register char *vec);
struct seg_ops segspt_shmops = {
segspt_shmadvise, /* advise */
};
enum seg_rw);
/*ARGSUSED*/
int
{
int err;
struct segspt_crargs sptcargs;
#ifdef DEBUG
#endif
if (segspt_minfree == 0) /* leave min 5% of availrmem for */
if (!hat_supported(HAT_SHARED_PT, (void *)0))
return (EINVAL);
/*
* get a new as for this shared memory segment
*/
/*
* create a shared page table (spt) segment
*/
return (err);
}
return (0);
}
void
{
#ifdef DEBUG
#endif
}
/*
* called from seg_free().
* free (i.e., unlock, unmap, return to free list)
* all the pages in the given seg.
*/
void
{
if (sptd->spt_realsize)
if (sptd->spt_ppa_lckcnt)
sizeof (*sptd->spt_ppa_lckcnt)
}
}
/*ARGSUSED*/
static int
{
return (0);
}
/*ARGSUSED*/
static size_t
{
#ifdef lint
#endif
/* page exists, and it's locked. */
}
return (len);
} else {
ulong_t i;
int ret;
return (EINVAL);
}
for (i = 0; i < npages; i++, anon_index++) {
ret = 0;
}
} else {
}
ret |= SEG_PAGE_LOCKED;
}
}
return (len);
}
}
static int
{
/*
* seg.s_size may have been rounded up to the largest page size
* in shmat().
* XXX This should be cleaned up. sptdestroy should take a length
* argument which should be the same as sptcreate. Then
* this rounding would not be needed (or is done in shm.c)
* Only the check for full segment will be needed.
*
* XXX -- shouldn't raddr == 0 always? These tests don't seem
* to be useful at all.
*/
return (0);
} else
return (EINVAL);
}
int
{
int err;
ulong_t i, j, anon_index = 0;
/*
* We are holding the a_lock on the underlying dummy as,
* so we can make calls to the HAT layer.
*/
#ifdef DEBUG
#endif
return (err);
}
goto out1;
KM_NOSLEEP)) == NULL)
goto out2;
}
goto out3;
/*
* Set policy to affect initial allocation of pages in
* anon_map_createpages()
*/
/*
* We are rounding up the size of the anon array
* to a 4M boundary because we always create 4M worth
* of page(s) when locking and faulting pages, so we
* don't have to check for all the corner cases, e.g.
* whether there is enough space to allocate a full
* 4M page.
*/
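/*
* Illustrative sketch (assumption, not the original code): rounding a
* byte count up to a 4M boundary with the sys/sysmacros.h helpers and
* converting it to pages; "aligned_size" and "anon_npages" are names
* invented for this sketch.
*/
#if 0	/* sketch only */
	size_t aligned_size = P2ROUNDUP(amp->size, (size_t)(4 * 1024 * 1024));
	pgcnt_t anon_npages = btopr(aligned_size);
#endif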
goto out4;
}
}
sptd->spt_pcachecnt = 0;
return (0);
}
/*
* get array of pages for each anon slot in amp
*/
goto out4;
/*
* addr is initial address corresponding to the first page on ppa list
*/
for (i = 0; i < npages; i++) {
/* attempt to lock all pages */
/*
* if unable to lock any page, unlock all
* of them and return error
*/
for (j = 0; j < i; j++)
for (i = 0; i < npages; i++) {
page_unlock(ppa[i]);
}
goto out4;
}
}
/*
* Some platforms assume that ISM mappings are HAT_LOAD_LOCK
* for the entire life of the segment, for example platforms
* that do not support Dynamic Reconfiguration.
*/
/*
* On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
* we will leave the pages locked SE_SHARED for the life
* of the ISM segment. This will prevent any calls to
* hat_pageunload() on this ISM segment for those platforms.
*/
if (!(hat_flags & HAT_LOAD_LOCK)) {
/*
* On platforms that support HAT_DYNAMIC_ISM_UNMAP,
* we no longer need to hold the SE_SHARED lock on the pages,
* since L_PAGELOCK and F_SOFTLOCK calls will grab the
* SE_SHARED lock on the pages as necessary.
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
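/*
* Illustrative sketch (assumption): the hat_flags test above typically
* derives from whether the HAT can unload ISM translations dynamically;
* the exact flag combination used by the original code is elided here.
*/
#if 0	/* sketch only */
	uint_t hat_flags;

	if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
		hat_flags = HAT_LOAD_SHARE;	/* pages may be unlocked now */
	else
		hat_flags = HAT_LOAD_LOCK;	/* keep SE_SHARED for life */
#endif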
sptd->spt_pcachecnt = 0;
return (0);
out4:
out3:
out2:
out1:
return (err);
}
/*ARGSUSED*/
void
{
int root = 0;
if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
}
panic("segspt_free_pages: null app");
/*NOTREACHED*/
}
} else {
== NULL)
continue;
}
/*
* If this platform supports HAT_DYNAMIC_ISM_UNMAP,
* the pages will not have the SE_SHARED lock at this
* point.
*
* On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
* the pages are still held SE_SHARED locked from the
* original segspt_create().
*
* Our goal is to get SE_EXCL lock on each page, remove
* permanent lock on it and invalidate the page.
*/
if (hat_flags == HAT_UNLOAD)
else {
panic("segspt_free_pages: "
"page not locked");
/*NOTREACHED*/
}
if (!page_tryupgrade(pp)) {
}
}
panic("segspt_free_pages: "
"page not in the system");
/*NOTREACHED*/
}
} else {
continue;
page_pp_unlock(pp, 0, 0);
}
/*
* It's logical to invalidate the pages here as in most cases
* these were created by segspt.
*/
/*
* For DISM swap is released in shm_rm_amp.
*/
panic("segspt_free_pages: pvp non NULL");
/*NOTREACHED*/
}
if (root == 0) {
root = 1;
curnpgs--;
root = 0;
curnpgs = 0;
} else {
curnpgs--;
}
} else {
panic("segspt_free_pages: bad large page");
/*NOTREACHED*/
}
/*LINTED: constant in conditional context */
}
}
panic("segspt_free_pages: bad large page");
/*NOTREACHED*/
}
/*
* mark that pages have been released
*/
sptd->spt_realsize = 0;
}
}
/*
* Get memory allocation policy info for specified address in given segment
*/
static lgrp_mem_policy_info_t *
{
/*
* Get anon_map from segspt
*
* Assume that no lock needs to be held on anon_map, since
* it should be protected by its reference count which must be
* nonzero for an existing segment
* Need to grab readers lock on policy tree though
*/
return (NULL);
/*
* Get policy info
*
* Assume starting anon index of 0
*/
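/*
* Illustrative sketch (assumption): with a starting anon index of 0,
* the anon index of addr is just its page index within the segment,
* which is then used for the lgroup shared-memory policy lookup.
*/
#if 0	/* sketch only */
	ulong_t anon_index = seg_page(seg, addr);
	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
#endif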
return (policy_info);
}
/*
* DISM only.
* Return locked pages over a given range.
*
* We will cache all DISM locked pages and save the pplist for the
* entire segment in the ppa field of the underlying DISM segment structure.
* Later, during a call to segspt_reclaim() we will use this ppa array
* to page_unlock() all of the pages and then we will free this ppa list.
*/
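/*
* Illustrative sketch of the scheme described above (not original code):
* the page list for the whole segment is cached in sptd->spt_ppa, so a
* request for a sub-range simply returns a pointer into that array.
*/
#if 0	/* sketch only */
	pgcnt_t pg_idx = seg_page(seg, addr);	/* page index of addr in seg */
	*ppp = &sptd->spt_ppa[pg_idx];		/* first slot for the caller */
#endif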
/*ARGSUSED*/
static int
{
pgcnt_t claim_availrmem = 0;
/*
* we will be using the underlying sptseg and its base address
* and length for the caching arguments.
*/
/*
* check if the request is larger than the number of pages covered
* by amp
*/
return (ENOTSUP);
}
if (type == L_PAGEUNLOCK) {
/*
* If someone is blocked while unmapping, we purge
* segment page cache and thus reclaim pplist synchronously
* without waiting for seg_pasync_thread. This speeds up
* unmapping in cases where munmap(2) is called, while
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
}
return (0);
} else if (type == L_PAGERECLAIM) {
return (0);
}
/*
* for DISM the ppa array needs to be rebuilt, since the
* number of locked pages could have changed
*/
return (ENOTSUP);
}
/*
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
return (ENOTSUP);
}
} else {
an_idx++;
}
}
/*
* Since we cache the entire DISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
return (0);
}
/* The L_PAGELOCK case... */
/*
* try to find pages in segment page cache with mutex
*/
return (ENOTSUP);
}
} else {
an_idx++;
}
}
/*
* Since we cache the entire DISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
return (0);
}
SEGP_FAIL) {
return (ENOTSUP);
}
/*
* No need to worry about protections because DISM pages are always rw.
*/
/*
* Do we need to build the ppa array?
*/
pl_built = 1;
/*
* Cache only mlocked pages. For large pages,
* if one (constituent) page is mlocked,
* all pages of that large page
* are cached as well. This allows quick
* lookups in the ppa array.
*/
if (lpg_cnt == 0) {
continue;
}
}
lpg_cnt = 0;
/*
* availrmem is decremented only
* for unlocked pages
*/
}
an_idx++;
}
claim_availrmem = 0;
goto insert_fail;
} else {
}
} else {
/*
* We already have a valid ppa[].
*/
}
/*
* seg_pinsert failed. We return
* ENOTSUP, so that the as_pagelock() code will
* then try the slower F_SOFTLOCK path.
*/
goto insert_fail;
}
/*
* In either case, we increment softlockcnt on the 'real' segment.
*/
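/*
* Illustrative sketch (assumption): softlockcnt on the sharing segment
* is bumped with an atomic add, one count per locked page.
*/
#if 0	/* sketch only */
	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
#endif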
sptd->spt_pcachecnt++;
return (ENOTSUP);
}
} else {
an_idx++;
}
}
/*
* We can now drop the sptd->spt_lock since the ppa[]
* exists and we have incremented pcachecnt.
*/
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
return (ret);
/*
* We will only reach this code if we tried and failed.
*
* And we can drop the lock on the dummy seg, once we've failed
* to set up a new ppa[].
*/
if (pl_built) {
/*
* We created pl and we need to destroy it.
*/
}
}
if (shmd->shm_softlockcnt <= 0) {
}
}
}
return (ret);
}
/*
* return locked pages over a given range.
*
* We will cache the entire ISM segment and save the pplist for the
* entire segment in the ppa field of the underlying ISM segment structure.
* Later, during a call to segspt_reclaim() we will use this ppa array
* to page_unlock() all of the pages and then we will free this ppa list.
*/
/*ARGSUSED*/
static int
{
/*
* we will be using the underlying sptseg and its base address
* and length for the caching arguments.
*/
}
/*
* check if the request is larger than the number of pages covered
* by amp
*/
return (ENOTSUP);
}
if (type == L_PAGEUNLOCK) {
/*
* If someone is blocked while unmapping, we purge
* segment page cache and thus reclaim pplist synchronously
* without waiting for seg_pasync_thread. This speeds up
* unmapping in cases where munmap(2) is called, while
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
}
return (0);
} else if (type == L_PAGERECLAIM) {
return (0);
}
/*
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
/*
* Since we cache the entire ISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
return (0);
}
/* The L_PAGELOCK case... */
/*
* try to find pages in segment page cache
*/
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
return (0);
}
SEGP_FAIL) {
return (ENOTSUP);
}
/*
* No need to worry about protections because ISM pages
* are always rw.
*/
/*
* Do we need to build the ppa array?
*/
pl_built = 1;
/*
* availrmem is decremented once during anon_swap_adjust()
* and is incremented during the anon_unresv(), which is
* called from shm_rm_amp() when the segment is destroyed.
*/
/* pcachecnt is protected by sptd->spt_lock */
}
goto insert_fail;
}
} else {
/*
* We already have a valid ppa[].
*/
}
/*
* seg_pinsert failed. We return
* ENOTSUP, so that the as_pagelock() code will
* then try the slower F_SOFTLOCK path.
*/
if (pl_built) {
/*
* No one else has referenced the ppa[].
* We created it and we need to destroy it.
*/
}
goto insert_fail;
}
/*
* In either case, we increment softlockcnt on the 'real' segment.
*/
sptd->spt_pcachecnt++;
/*
* We can now drop the sptd->spt_lock since the ppa[]
* exists and we have incremented pcachecnt.
*/
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
return (ret);
/*
* We will only reach this code if we tried and failed.
*
* And we can drop the lock on the dummy seg, once we've failed
* to set up a new ppa[].
*/
if (pl_built) {
/*
* We created pl and we need to destroy it.
*/
while (np) {
np--;
pplist++;
}
}
if (shmd->shm_softlockcnt <= 0) {
}
}
}
return (ret);
}
/*
* purge any cached pages in the I/O page cache
*/
static void
{
}
static int
{
int done = 0;
#ifdef lint
#endif
/*
* Acquire the lock on the dummy seg and destroy the
* ppa array IF this is the last pcachecnt.
*/
if (--sptd->spt_pcachecnt == 0) {
for (i = 0; i < npages; i++) {
continue;
}
hat_setrefmod(pplist[i]);
} else {
hat_setref(pplist[i]);
}
(sptd->spt_ppa_lckcnt[i] == 0))
page_unlock(pplist[i]);
}
}
/*
* we will track the pplist in a segspt-specific field,
* ppa, which is initialized at the time we add an entry to
* the cache.
*/
done = 1;
}
/*
* Now decrement softlockcnt.
*/
if (shmd->shm_softlockcnt <= 0) {
}
}
}
return (done);
}
/*
* Do a F_SOFTUNLOCK call over the range requested.
* The range must have already been F_SOFTLOCK'ed.
*
* The calls to acquire and release the anon map lock mutex were
* removed in order to avoid a deadly embrace during a DR
* memory delete operation. (E.g., DR blocks while waiting for an
* exclusive lock on a page that is being used for kaio; the
* thread that will complete the kaio and call segspt_softunlock
* blocks on the anon map lock; another thread holding the anon
* map lock blocks on another page lock via the segspt_shmfault
* -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
*
* The appropriateness of the removal is based upon the following:
* 1. If we are holding a segment's reader lock and the page is held
* shared, then the corresponding element in anonmap which points to
* anon struct cannot change and there is no need to acquire the
* anonymous map lock.
* 2. Threads in segspt_softunlock have a reader lock on the segment
* and already have the shared page lock, so we are guaranteed that
* the anon map slot cannot change and therefore can call anon_get_ptr()
* without grabbing the anonymous map lock.
* 3. Threads that softlock a shared page break copy-on-write, even if
* it's a read. Thus cow faults can be ignored with respect to soft
* unlocking, since the breaking of cow means that the anon slot(s) will
* not be shared.
*/
static void
{
/*
* Some platforms assume that ISM mappings are HAT_LOAD_LOCK
* and therefore their pages are SE_SHARED locked
* for the entire life of the segment.
*/
if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
goto softlock_decrement;
}
/*
* Any thread is free to do a page_find and
* page_unlock() on the pages within this seg.
*
* We are already holding the as->a_lock on the user's
* real segment, but we need to hold the a_lock on the
* underlying dummy as. This is mostly to satisfy the
* underlying HAT layer.
*/
/*
* Use page_find() instead of page_lookup() to
* find the page since we know that it has a
* "shared" lock.
*/
panic("segspt_softunlock: "
"addr %p, ap %p, vp %p, off %llx",
/*NOTREACHED*/
}
hat_setref(pp);
}
}
if (shmd->shm_softlockcnt == 0) {
/*
* All SOFTLOCKS are gone. Wakeup any waiting
* unmappers so they can try again to unmap.
* Check for waiters first without the mutex
* held so we don't always grab the mutex on
* softunlocks.
*/
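/*
* Illustrative sketch (assumption) of the unlocked check followed by a
* re-check under a_contents before waking the unmap waiters.
*/
#if 0	/* sketch only */
	if (AS_ISUNMAPWAIT(seg->s_as)) {
		mutex_enter(&seg->s_as->a_contents);
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			AS_CLRUNMAPWAIT(seg->s_as);
			cv_broadcast(&seg->s_as->a_cv);
		}
		mutex_exit(&seg->s_as->a_contents);
	}
#endif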
}
}
}
}
int
{
int error = 0;
return (ENOMEM);
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
shmd->shm_lckpgs = 0;
if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
}
}
} else {
}
if (error) {
} else {
}
return (error);
}
int
{
int reclaim = 1;
if (shmd->shm_softlockcnt > 0) {
if (reclaim == 1) {
reclaim = 0;
goto retry;
}
return (EAGAIN);
}
#ifdef DEBUG
#endif
return (EINVAL);
}
NULL, 0);
return (0);
}
void
{
/*
* Need to increment refcnt when attaching
* and decrement when detaching because of dup().
*/
}
}
/*ARGSUSED*/
int
{
/*
* Shared page table is more than shared mapping.
* Individual processes sharing page tables can't change prot
* because there is only one set of page tables.
* This will be allowed after private page tables are
* supported.
*/
/* need to return correct status error? */
return (0);
}
{
int i;
int err = 0;
#ifdef lint
#endif
/*
* Because of the way spt is implemented
* the realsize of the segment does not have to be
* equal to the segment size itself. The segment size is
* often in multiples of a page size larger than PAGESIZE.
* The realsize is rounded up to the nearest PAGESIZE
* based on what the user requested. This is a bit of
* ugliness that is historical but not easily fixed
* without re-designing the higher levels of ISM.
*/
return (FC_NOMAP);
/*
* For all of the following cases except F_PROT, we need to
* make any necessary adjustments to addr and len
* and get all of the necessary page_t's into an array called ppa[].
*
* The code in shmat() forces base addr and len of ISM segment
* to be aligned to largest page size supported. Therefore,
* we are able to handle F_SOFTLOCK and F_INVAL calls in "large
* pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
* in large pagesize chunks, or else we will screw up the HAT
* layer by calling hat_memload_array() with differing page sizes
* over a given virtual range.
*/
/*
* Now we need to convert from addr in segshm to addr in segspt.
*/
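/*
* Illustrative sketch (assumption): aligning the faulting range to the
* segment's large page size and translating the user (segshm) address
* into the underlying dummy spt segment; the local names are invented
* for this sketch.
*/
#if 0	/* sketch only */
	size_t share_size = page_get_pagesize(sptseg->s_szc);
	caddr_t shm_addr = (caddr_t)P2ALIGN((uintptr_t)addr, share_size);
	size_t size = P2ROUNDUP((uintptr_t)(addr + len) - (uintptr_t)shm_addr,
	    share_size);
	caddr_t segspt_addr = sptseg->s_base + ptob(seg_page(seg, shm_addr));
#endif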
switch (type) {
case F_SOFTLOCK:
return (FC_MAKE_ERR(ENOMEM));
} else {
}
/*
* Fall through to the F_INVAL case to load up the hat layer
* entries with the HAT_LOAD_LOCK flag.
*/
/* FALLTHRU */
case F_INVAL:
return (FC_NOMAP);
if (err != 0) {
if (type == F_SOFTLOCK) {
atomic_add_long((ulong_t *)(
}
goto dism_err;
}
if (type == F_SOFTLOCK) {
/*
* Load up the translation keeping it
* locked and don't unlock the page.
*/
} else {
/*
* Migrate pages marked for migration
*/
if (lgrp_optimizations())
npages);
/* CPU HAT */
} else {
/* XHAT. Pass real address */
}
/*
* And now drop the SE_SHARED lock(s).
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
panic("hat_share err in DISM fault");
/* NOTREACHED */
}
}
return (err);
case F_SOFTUNLOCK:
/*
* This is a bit ugly, we pass in the real seg pointer,
* but the segspt_addr is the virtual address within the
* dummy seg.
*/
return (0);
case F_PROT:
/*
* This takes care of the unusual case where a user
* allocates a stack in shared memory and a register
* window overflow is written to that stack page before
* it is otherwise modified.
*
* We can get away with this because ISM segments are
* always rw. Other than this unusual case, there
* should be no instances of protection violations.
*/
return (0);
default:
#ifdef DEBUG
panic("segspt_dismfault default type?");
#else
return (FC_NOMAP);
#endif
}
}
{
int i;
ulong_t anon_index = 0;
#ifdef lint
#endif
}
/*
* Because of the way spt is implemented
* the realsize of the segment does not have to be
* equal to the segment size itself. The segment size is
* often in multiples of a page size larger than PAGESIZE.
* The realsize is rounded up to the nearest PAGESIZE
* based on what the user requested. This is a bit of
* ugliness that is historical but not easily fixed
* without re-designing the higher levels of ISM.
*/
return (FC_NOMAP);
/*
* For all of the following cases except F_PROT, we need to
* make any necessary adjustments to addr and len
* and get all of the necessary page_t's into an array called ppa[].
*
* The code in shmat() forces base addr and len of ISM segment
* to be aligned to largest page size supported. Therefore,
* we are able to handle F_SOFTLOCK and F_INVAL calls in "large
* pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
* in large pagesize chunks, or else we will screw up the HAT
* layer by calling hat_memload_array() with differing page sizes
* over a given virtual range.
*/
/*
* Now we need to convert from addr in segshm to addr in segspt.
*/
/*
* And now we may have to adjust npages downward if we have
* exceeded the realsize of the segment or initial anon
* allocations.
*/
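/*
* Illustrative sketch (assumption): clamp npages so the fault never
* extends past spt_realsize, i.e. past what was actually allocated.
*/
#if 0	/* sketch only */
	if ((sptseg_addr + ptob(npages)) >
	    (sptseg->s_base + sptd->spt_realsize))
		npages = btopr((sptseg->s_base + sptd->spt_realsize) -
		    sptseg_addr);
#endif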
switch (type) {
case F_SOFTLOCK:
/*
* availrmem is decremented once during anon_swap_adjust()
* and is incremented during the anon_unresv(), which is
* called from shm_rm_amp() when the segment is destroyed.
*/
/*
* Some platforms assume that ISM pages are SE_SHARED
* locked for the entire life of the segment.
*/
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
return (0);
/*
* Fall through to the F_INVAL case to load up the hat layer
* entries with the HAT_LOAD_LOCK flag.
*/
/* FALLTHRU */
case F_INVAL:
return (FC_NOMAP);
/*
* Some platforms that do NOT support DYNAMIC_ISM_UNMAP
* may still rely on this call to hat_share(). That
* would imply that those hat's can fault on a
* HAT_LOAD_LOCK translation, which would seem
* contradictory.
*/
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
panic("hat_share error in ISM fault");
/*NOTREACHED*/
}
return (0);
}
/*
* I see no need to lock the real seg
* here, because all of our work will be on the underlying
* dummy seg.
*
* sptseg_addr and npages now account for large pages.
*/
for (i = 0; i < npages; i++) {
}
/*
* We are already holding the as->a_lock on the user's
* real segment, but we need to hold the a_lock on the
* underlying dummy as. This is mostly to satisfy the
* underlying HAT layer.
*/
if (type == F_SOFTLOCK) {
/*
* Load up the translation keeping it
* locked and don't unlock the page.
*/
} else {
/*
* Migrate pages marked for migration.
*/
if (lgrp_optimizations())
npages);
/* CPU HAT */
} else {
/* XHAT. Pass real address */
}
/*
* And now drop the SE_SHARED lock(s).
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
return (0);
case F_SOFTUNLOCK:
/*
* This is a bit ugly, we pass in the real seg pointer,
* but the sptseg_addr is the virtual address within the
* dummy seg.
*/
return (0);
case F_PROT:
/*
* This takes care of the unusual case where a user
* allocates a stack in shared memory and a register
* window overflow is written to that stack page before
* it is otherwise modified.
*
* We can get away with this because ISM segments are
* always rw. Other than this unusual case, there
* should be no instances of protection violations.
*/
return (0);
default:
#ifdef DEBUG
#endif
return (FC_NOMAP);
}
}
/*ARGSUSED*/
static faultcode_t
{
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
}
/*ARGSUSED*/
static size_t
{
return (0);
}
/*
* duplicate the shared page tables
*/
int
{
shmd_new->shm_lckpgs = 0;
}
}
/*ARGSUSED*/
int
{
/*
* ISM segment is always rw.
*/
}
/*
* Return an array of locked large pages; for empty slots allocate
* private zero-filled anon pages.
*/
static int
{
ppa_idx = 0;
/*CONSTCOND*/
while (1) {
if (ierr != 0) {
if (ierr > 0) {
goto lpgs_err;
}
break;
}
}
break;
}
/*
* ierr == -1 means we failed to allocate a large page.
* so do a size down operation.
*
* ierr == -2 means some other process that privately shares
* pages with this process has allocated a larger page and we
* need to retry with larger pages. So do a size up
* operation. This relies on the fact that large pages are
* never partially shared, i.e. if we share any constituent
* page of a large page with another process we must share the
* entire large page. Note this cannot happen for SOFTLOCK
* case, unless current address (lpaddr) is at the beginning
* of the next page size boundary because the other process
* couldn't have relocated locked pages.
*/
if (segvn_anypgsz) {
} else {
/*
* For faults and segvn_anypgsz == 0
* we need to be careful not to loop forever
* if existing page is found with szc other
* than 0 or seg->s_szc. This could be due
* to page relocations on behalf of DR or
* more likely large page creation. For this
* case simply re-size to existing page's szc
* if returned by anon_map_getpages().
*/
} else {
}
}
}
return (0);
for (j = 0; j < ppa_idx; j++)
page_unlock(ppa[j]);
return (err);
}
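/*
* Illustrative sketch (assumption) of the ierr-driven page size retry
* described in the comment above; szc, pszc and pgsz are local names
* used only for this sketch.
*/
#if 0	/* sketch only */
	if (segvn_anypgsz) {
		szc = (ierr == -1) ? szc - 1 : szc + 1;	/* size down / up */
	} else {
		/* re-size to the existing page's szc, else the seg's szc */
		szc = (ierr == -2 && pszc != 0 && pszc != seg->s_szc) ?
		    pszc : seg->s_szc;
	}
	pgsz = page_get_pagesize(szc);
#endif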
int
{
ulong_t i;
int kernel;
"DISM page lock limit "
"reached on DISM offset 0x%lx\n",
anon_index << PAGESHIFT);
}
/* unlock rest of the pages */
for (; i < npages; i++)
page_unlock(ppa[i]);
return (EAGAIN);
}
shmd->shm_lckpgs++;
}
}
page_unlock(ppa[i]);
}
return (0);
}
/*ARGSUSED*/
static int
{
ulong_t i;
int sts = 0;
return (0);
}
return (ENOMEM);
}
/*
* Need to align the addr and size request if they are not
* aligned, so we can always allocate large page(s); however,
* we only lock what was asked for in the initial request.
*/
share_sz);
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
/*
* Don't cache any new pages for IO and
* flush any cached pages.
*/
if (sts != 0) {
return (sts);
}
/*
* unlock remaining pages for requests which are not
* aligned or not in 4 M chunks
*/
for (i = 0; i < ppa_idx; i++)
page_unlock(ppa[i]);
page_unlock(ppa[i]);
int kernel;
if (shmd->shm_lckpgs == 0) {
return (0);
}
/*
* Don't cache new IO pages.
*/
/*
* availrmem is decremented only for
* pages which are not in the seg pcache;
* for pages in the seg pcache, availrmem was
* decremented in _dismpagelock() (if
* they were not locked here)
*/
shmd->shm_lckpgs--;
}
}
}
return (sts);
}
/*ARGSUSED*/
int
{
/*
* ISM segment is always rw.
*/
while (--pgno >= 0)
return (0);
}
/*ARGSUSED*/
{
/* Offset does not matter in ISM memory */
return ((u_offset_t)0);
}
/* ARGSUSED */
int
{
/*
* The shared memory mapping is always MAP_SHARED; SWAP is only
* reserved for DISM.
*/
return (MAP_SHARED |
}
/*ARGSUSED*/
int
{
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
/*
* Purge all DISM cached pages
*/
int already_set;
/*
* Align address and length to page size of underlying segment
*/
/*
* And now we may have to adjust size downward if we have
* exceeded the realsize of the segment or initial anon
* allocations.
*/
if ((sptseg_addr + size) >
/*
* Set memory allocation policy for this segment
*/
/*
* If random memory allocation policy set already,
* don't bother reapplying it.
*/
return (0);
/*
* Mark any existing pages in the given range for
* migration, flushing the I/O page cache, and using the
* underlying segment to calculate the anon index and to get
* the anonmap and vnode pointer from it
*/
if (shmd->shm_softlockcnt > 0)
}
return (0);
}
/*ARGSUSED*/
void
{
/* no-op for ISM segment */
}
/*ARGSUSED*/
static faultcode_t
{
return (ENOTSUP);
}
/*
* get a memory ID for an addr in a given segment
*/
static int
{
return (EFAULT);
}
return (ENOMEM);
}
}
return (0);
}
/*
* Get memory allocation policy info for specified address in given segment
*/
static lgrp_mem_policy_info_t *
{
/*
* Get anon_map from segshm
*
* Assume that no lock needs to be held on anon_map, since
* it should be protected by its reference count which must be
* nonzero for an existing segment
* Need to grab readers lock on policy tree though
*/
return (NULL);
/*
* Get policy info
*
* Assume starting anon index of 0
*/
return (policy_info);
}
/*ARGSUSED*/
static int
{
return (0);
}