/* seg_spt.c, revision 1bd5c35fd400f7f19eee9efd795c32cedb602b06 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/param.h>
#include <sys/user.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/atomic.h>
#include <vm/seg_spt.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/shm.h>
#include <sys/lgrp.h>
#include <sys/vmsystm.h>
#include <sys/tnf_probe.h>
#define SEGSPTADDR (caddr_t)0x0
/*
* # pages used for spt
*/
static size_t spt_used;
/*
* segspt_minfree is the memory left for the system after ISM
* has locked its pages; it is set to 5% of availrmem in
* sptcreate() when ISM is created. ISM should not use more
* than ~90% of availrmem; if it does, the performance of the
* system may decrease. Machines with large memories may be
* able to use more memory for ISM, so we set the default
* segspt_minfree to 5% (which gives ISM at most 95% of availrmem).
* If somebody wants even more memory for ISM (risking hanging
* the system) they can patch segspt_minfree to a smaller number.
*/
pgcnt_t segspt_minfree = 0;
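/*
* Worked example (illustrative only): on a machine where availrmem is
* 1,000,000 pages when sptcreate() first runs, segspt_minfree defaults
* to availrmem / 20 = 50,000 pages, so ISM may lock at most
* 1,000,000 - 50,000 = 950,000 pages (95% of availrmem).
*/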
static int segspt_create(struct seg *seg, caddr_t argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
static void
segspt_badop()
{
panic("segspt_badop called");
/*NOTREACHED*/
}
#define SEGSPT_BADOP(t) (t(*)())segspt_badop
struct seg_ops segspt_ops = {
SEGSPT_BADOP(int), /* dup */
segspt_unmap,
segspt_free,
SEGSPT_BADOP(int), /* fault */
SEGSPT_BADOP(faultcode_t), /* faulta */
SEGSPT_BADOP(int), /* setprot */
SEGSPT_BADOP(int), /* checkprot */
SEGSPT_BADOP(int), /* kluster */
SEGSPT_BADOP(size_t), /* swapout */
SEGSPT_BADOP(int), /* sync */
SEGSPT_BADOP(size_t), /* incore */
SEGSPT_BADOP(int), /* lockop */
SEGSPT_BADOP(int), /* getprot */
SEGSPT_BADOP(u_offset_t), /* getoffset */
SEGSPT_BADOP(int), /* gettype */
SEGSPT_BADOP(int), /* getvp */
SEGSPT_BADOP(int), /* advise */
SEGSPT_BADOP(void), /* dump */
SEGSPT_BADOP(int), /* pagelock */
SEGSPT_BADOP(int), /* setpgsz */
SEGSPT_BADOP(int), /* getmemid */
segspt_getpolicy, /* getpolicy */
SEGSPT_BADOP(int), /* capable */
};
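/*
* Note on the ops table above: SEGSPT_BADOP(t) merely casts
* segspt_badop to a function returning type t, so an entry such as
* the dup slot expands to (int (*)())segspt_badop. Invoking any of
* these ops on the dummy spt segment panics the system; only unmap,
* free and getpolicy are legitimate operations on it.
*/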
static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
register size_t len, register uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
uint_t prot);
static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segspt_shmswapout(struct seg *seg);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
register char *vec);
static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
int attr, uint_t flags);
static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
int attr, int op, ulong_t *lockmap, size_t pos);
static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
uint_t *protv);
static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
static int segspt_shmgettype(struct seg *seg, caddr_t addr);
static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
uint_t behav);
static void segspt_shmdump(struct seg *seg);
static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
struct page ***, enum lock_type, enum seg_rw);
static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
static int segspt_shmcapable(struct seg *, segcapability_t);
struct seg_ops segspt_shmops = {
segspt_shmdup,
segspt_shmunmap,
segspt_shmfree,
segspt_shmfault,
segspt_shmfaulta,
segspt_shmsetprot,
segspt_shmcheckprot,
segspt_shmkluster,
segspt_shmswapout,
segspt_shmsync,
segspt_shmincore,
segspt_shmlockop,
segspt_shmgetprot,
segspt_shmgetoffset,
segspt_shmgettype,
segspt_shmgetvp,
segspt_shmadvise, /* advise */
segspt_shmdump,
segspt_shmpagelock,
segspt_shmsetpgsz,
segspt_shmgetmemid,
segspt_shmgetpolicy,
segspt_shmcapable,
};
static void segspt_purge(struct seg *seg);
static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **,
enum seg_rw);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
page_t **ppa);
/*ARGSUSED*/
int
sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
uint_t prot, uint_t flags, uint_t share_szc)
{
int err;
struct as *newas;
struct segspt_crargs sptcargs;
#ifdef DEBUG
TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
tnf_ulong, size, size );
#endif
if (segspt_minfree == 0) /* leave min 5% of availrmem */
segspt_minfree = availrmem/20; /* for the system */
if (!hat_supported(HAT_SHARED_PT, (void *)0))
return (EINVAL);
/*
* get a new as for this shared memory segment
*/
newas = as_alloc();
sptcargs.amp = amp;
sptcargs.prot = prot;
sptcargs.flags = flags;
sptcargs.szc = share_szc;
/*
* create a shared page table (spt) segment
*/
if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
as_free(newas);
return (err);
}
*sptseg = sptcargs.seg_spt;
return (0);
}
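/*
* Illustrative sketch (not part of this file): the expected calling
* convention for sptcreate()/sptdestroy(). The real caller lives in
* shm.c; the names below (size, prot, flags, share_szc) are
* hypothetical placeholders for values the caller already has. For
* DISM, flags would include SHM_PAGEABLE.
*
*      struct seg *sptseg;
*      int err;
*
*      err = sptcreate(size, &sptseg, amp, prot, flags, share_szc);
*      if (err == 0) {
*              ... use sptseg->s_as as the dummy as shared by all
*              ... attachers, then tear it down when the last
*              ... reference goes away:
*              sptdestroy(sptseg->s_as, amp);
*      }
*/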
void
sptdestroy(struct as *as, struct anon_map *amp)
{
#ifdef DEBUG
TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
#endif
(void) as_unmap(as, SEGSPTADDR, amp->size);
as_free(as);
}
/*
* Called from seg_free().
* Free (i.e., unlock, unmap, return to the free list)
* all the pages in the given seg.
*/
void
segspt_free(struct seg *seg)
{
struct spt_data *sptd = (struct spt_data *)seg->s_data;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
if (sptd != NULL) {
if (sptd->spt_realsize)
segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
if (sptd->spt_ppa_lckcnt)
kmem_free(sptd->spt_ppa_lckcnt,
sizeof (*sptd->spt_ppa_lckcnt)
* btopr(sptd->spt_amp->size));
kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
mutex_destroy(&sptd->spt_lock);
kmem_free(sptd, sizeof (*sptd));
}
}
/*ARGSUSED*/
static int
segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
uint_t flags)
{
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
return (0);
}
/*ARGSUSED*/
static size_t
segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
caddr_t eo_seg;
pgcnt_t npages;
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg;
struct spt_data *sptd;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
#ifdef lint
seg = seg;
#endif
sptseg = shmd->shm_sptseg;
sptd = sptseg->s_data;
if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
eo_seg = addr + len;
while (addr < eo_seg) {
/* page exists, and it's locked. */
*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
SEG_PAGE_ANON;
addr += PAGESIZE;
}
return (len);
} else {
struct anon_map *amp = shmd->shm_amp;
struct anon *ap;
page_t *pp;
pgcnt_t anon_index;
struct vnode *vp;
u_offset_t off;
ulong_t i;
int ret;
anon_sync_obj_t cookie;
addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
anon_index = seg_page(seg, addr);
npages = btopr(len);
if (anon_index + npages > btopr(shmd->shm_amp->size)) {
return (EINVAL);
}
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
for (i = 0; i < npages; i++, anon_index++) {
ret = 0;
anon_array_enter(amp, anon_index, &cookie);
ap = anon_get_ptr(amp->ahp, anon_index);
if (ap != NULL) {
swap_xlate(ap, &vp, &off);
anon_array_exit(&cookie);
pp = page_lookup_nowait(vp, off, SE_SHARED);
if (pp != NULL) {
ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
page_unlock(pp);
}
} else {
anon_array_exit(&cookie);
}
if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
ret |= SEG_PAGE_LOCKED;
}
*vec++ = (char)ret;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
return (len);
}
}
static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
size_t share_size;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* seg.s_size may have been rounded up to the largest page size
* in shmat().
* XXX This should be cleaned up. sptdestroy should take a length
* argument which should be the same as sptcreate's. Then
* this rounding would not be needed (or would be done in shm.c).
* Only the check for the full segment would be needed.
*
* XXX -- shouldn't raddr == 0 always? These tests don't seem
* to be useful at all.
*/
share_size = page_get_pagesize(seg->s_szc);
ssize = P2ROUNDUP(ssize, share_size);
if (raddr == seg->s_base && ssize == seg->s_size) {
seg_free(seg);
return (0);
} else
return (EINVAL);
}
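/*
* Rounding example (illustrative): if seg->s_szc selects, say, a 4M
* shared page size and the caller passes ssize = 5M, P2ROUNDUP()
* yields 8M, which must then equal seg->s_size exactly (and raddr must
* equal seg->s_base) for the unmap to succeed; anything else is EINVAL.
*/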
int
segspt_create(struct seg *seg, caddr_t argsp)
{
int err;
caddr_t addr = seg->s_base;
struct spt_data *sptd;
struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
struct anon_map *amp = sptcargs->amp;
struct cred *cred = CRED();
ulong_t i, j, anon_index = 0;
pgcnt_t npages = btopr(amp->size);
struct vnode *vp;
page_t **ppa;
uint_t hat_flags;
/*
* We are holding the a_lock on the underlying dummy as,
* so we can make calls to the HAT layer.
*/
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
#ifdef DEBUG
TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
tnf_opaque, addr, addr,
tnf_ulong, len, seg->s_size);
#endif
if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
if (err = anon_swap_adjust(npages))
return (err);
}
err = ENOMEM;
if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
goto out1;
if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
KM_NOSLEEP)) == NULL)
goto out2;
}
mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
goto out3;
seg->s_ops = &segspt_ops;
sptd->spt_vp = vp;
sptd->spt_amp = amp;
sptd->spt_prot = sptcargs->prot;
sptd->spt_flags = sptcargs->flags;
seg->s_data = (caddr_t)sptd;
sptd->spt_ppa = NULL;
sptd->spt_ppa_lckcnt = NULL;
seg->s_szc = sptcargs->szc;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->a_szc = seg->s_szc;
ANON_LOCK_EXIT(&amp->a_rwlock);
/*
* Set policy to affect initial allocation of pages in
* anon_map_createpages()
*/
(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
NULL, 0, ptob(npages));
if (sptcargs->flags & SHM_PAGEABLE) {
size_t share_sz;
pgcnt_t new_npgs, more_pgs;
struct anon_hdr *nahp;
share_sz = page_get_pagesize(seg->s_szc);
if (!IS_P2ALIGNED(amp->size, share_sz)) {
/*
* We round the size of the anon array up to a 4M
* boundary because we always create 4M worth of
* pages when locking and faulting, so we don't
* have to check all the corner cases, e.g.
* whether there is enough space to allocate a
* full 4M page.
*/
new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
more_pgs = new_npgs - npages;
if (anon_resv(ptob(more_pgs)) == 0) {
err = ENOMEM;
goto out4;
}
nahp = anon_create(new_npgs, ANON_SLEEP);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
ANON_SLEEP);
anon_release(amp->ahp, npages);
amp->ahp = nahp;
amp->swresv = amp->size = ptob(new_npgs);
ANON_LOCK_EXIT(&amp->a_rwlock);
npages = new_npgs;
}
sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
sptd->spt_pcachecnt = 0;
sptd->spt_realsize = ptob(npages);
sptcargs->seg_spt = seg;
return (0);
}
/*
* get array of pages for each anon slot in amp
*/
if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
seg, addr, S_CREATE, cred)) != 0)
goto out4;
/*
* addr is initial address corresponding to the first page on ppa list
*/
for (i = 0; i < npages; i++) {
/* attempt to lock all pages */
if (!page_pp_lock(ppa[i], 0, 1)) {
/*
* if unable to lock any page, unlock all
* of them and return error
*/
for (j = 0; j < i; j++)
page_pp_unlock(ppa[j], 0, 1);
for (i = 0; i < npages; i++) {
page_unlock(ppa[i]);
}
err = ENOMEM;
goto out4;
}
}
/*
* Some platforms assume that ISM mappings are HAT_LOAD_LOCK
* for the entire life of the segment, for example platforms
* that do not support Dynamic Reconfiguration.
*/
hat_flags = HAT_LOAD_SHARE;
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
hat_flags |= HAT_LOAD_LOCK;
hat_memload_array(seg->s_as->a_hat, addr, ptob(npages),
ppa, sptd->spt_prot, hat_flags);
/*
* On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
* we will leave the pages locked SE_SHARED for the life
* of the ISM segment. This will prevent any calls to
* hat_pageunload() on this ISM segment for those platforms.
*/
if (!(hat_flags & HAT_LOAD_LOCK)) {
/*
* On platforms that support HAT_DYNAMIC_ISM_UNMAP,
* we no longer need to hold the SE_SHARED lock on the pages,
* since L_PAGELOCK and F_SOFTLOCK calls will grab the
* SE_SHARED lock on the pages as necessary.
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
sptd->spt_pcachecnt = 0;
kmem_free(ppa, ((sizeof (page_t *)) * npages));
sptd->spt_realsize = ptob(npages);
atomic_add_long(&spt_used, npages);
sptcargs->seg_spt = seg;
return (0);
out4:
seg->s_data = NULL;
kmem_free(vp, sizeof (*vp));
out3:
mutex_destroy(&sptd->spt_lock);
if ((sptcargs->flags & SHM_PAGEABLE) == 0)
kmem_free(ppa, (sizeof (*ppa) * npages));
out2:
kmem_free(sptd, sizeof (*sptd));
out1:
if ((sptcargs->flags & SHM_PAGEABLE) == 0)
anon_swap_restore(npages);
return (err);
}
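/*
* Summary sketch of segspt_create() (illustrative, simplified):
*
*      DISM (SHM_PAGEABLE):
*              round amp->size up to the shared page size (reserving
*              extra swap if needed), allocate spt_ppa_lckcnt[], set
*              spt_realsize and return; pages are created lazily at
*              fault/lock time.
*
*      ISM (default):
*              anon_swap_adjust(npages);
*              anon_map_createpages(...);      allocate every page now
*              page_pp_lock() each page;       lock them in memory
*              hat_memload_array(..., HAT_LOAD_SHARE [| HAT_LOAD_LOCK]);
*              page_unlock() the pages if HAT_DYNAMIC_ISM_UNMAP is
*              supported, otherwise leave them SE_SHARED for the life
*              of the segment.
*/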
/*ARGSUSED*/
void
segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
{
struct page *pp;
struct spt_data *sptd = (struct spt_data *)seg->s_data;
pgcnt_t npages;
ulong_t anon_idx;
struct anon_map *amp;
struct anon *ap;
struct vnode *vp;
u_offset_t off;
uint_t hat_flags;
int root = 0;
pgcnt_t pgs, curnpgs = 0;
page_t *rootpp;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
len = P2ROUNDUP(len, PAGESIZE);
npages = btop(len);
hat_flags = HAT_UNLOAD_UNLOCK;
if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
(sptd->spt_flags & SHM_PAGEABLE)) {
hat_flags = HAT_UNLOAD;
}
hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
amp = sptd->spt_amp;
if (sptd->spt_flags & SHM_PAGEABLE)
npages = btop(amp->size);
ASSERT(amp);
for (anon_idx = 0; anon_idx < npages; anon_idx++) {
if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
panic("segspt_free_pages: null app");
/*NOTREACHED*/
}
} else {
if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
== NULL)
continue;
}
ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
swap_xlate(ap, &vp, &off);
/*
* If this platform supports HAT_DYNAMIC_ISM_UNMAP,
* the pages will not be holding an SE_SHARED lock at
* this point.
*
* On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
* the pages are still held SE_SHARED locked from the
* original segspt_create().
*
* Our goal is to get an SE_EXCL lock on each page, remove
* the permanent lock on it and invalidate the page.
*/
if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
if (hat_flags == HAT_UNLOAD)
pp = page_lookup(vp, off, SE_EXCL);
else {
if ((pp = page_find(vp, off)) == NULL) {
panic("segspt_free_pages: "
"page not locked");
/*NOTREACHED*/
}
if (!page_tryupgrade(pp)) {
page_unlock(pp);
pp = page_lookup(vp, off, SE_EXCL);
}
}
if (pp == NULL) {
panic("segspt_free_pages: "
"page not in the system");
/*NOTREACHED*/
}
page_pp_unlock(pp, 0, 1);
} else {
if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
continue;
page_pp_unlock(pp, 0, 0);
}
/*
* It's logical to invalidate the pages here as in most cases
* these were created by segspt.
*/
if (pp->p_szc != 0) {
/*
* For DISM swap is released in shm_rm_amp.
*/
if ((sptd->spt_flags & SHM_PAGEABLE) == 0 &&
ap->an_pvp != NULL) {
panic("segspt_free_pages: pvp non NULL");
/*NOTREACHED*/
}
if (root == 0) {
ASSERT(curnpgs == 0);
root = 1;
rootpp = pp;
pgs = curnpgs = page_get_pagecnt(pp->p_szc);
ASSERT(pgs > 1);
ASSERT(IS_P2ALIGNED(pgs, pgs));
ASSERT(!(page_pptonum(pp) & (pgs - 1)));
curnpgs--;
} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
ASSERT(curnpgs == 1);
ASSERT(page_pptonum(pp) ==
page_pptonum(rootpp) + (pgs - 1));
page_destroy_pages(rootpp);
root = 0;
curnpgs = 0;
} else {
ASSERT(curnpgs > 1);
ASSERT(page_pptonum(pp) ==
page_pptonum(rootpp) + (pgs - curnpgs));
curnpgs--;
}
} else {
if (root != 0 || curnpgs != 0) {
panic("segspt_free_pages: bad large page");
/*NOTREACHED*/
}
/*LINTED: constant in conditional context */
VN_DISPOSE(pp, B_INVAL, 0, kcred);
}
}
if (root != 0 || curnpgs != 0) {
panic("segspt_free_pages: bad large page");
/*NOTREACHED*/
}
/*
* mark that pages have been released
*/
sptd->spt_realsize = 0;
if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
atomic_add_long(&spt_used, -npages);
anon_swap_restore(npages);
}
}
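/*
* Illustrative walk-through of the large-page bookkeeping above: for a
* large page with, say, 8 constituent pages, the loop sees the root
* page first (root = 1, curnpgs = pgs - 1 = 7), counts down through
* the middle constituents, and only when the last constituent is
* reached (curnpgs == 1 and its page number is rootpp's + pgs - 1)
* does it call page_destroy_pages(rootpp) to free the whole large
* page at once. Small (szc == 0) pages are simply VN_DISPOSE()d one
* at a time.
*/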
/*
* Get memory allocation policy info for specified address in given segment
*/
static lgrp_mem_policy_info_t *
segspt_getpolicy(struct seg *seg, caddr_t addr)
{
struct anon_map *amp;
ulong_t anon_index;
lgrp_mem_policy_info_t *policy_info;
struct spt_data *spt_data;
ASSERT(seg != NULL);
/*
* Get anon_map from segspt
*
* Assume that no lock needs to be held on anon_map, since
* it should be protected by its reference count which must be
* nonzero for an existing segment
* Need to grab readers lock on policy tree though
*/
spt_data = (struct spt_data *)seg->s_data;
if (spt_data == NULL)
return (NULL);
amp = spt_data->spt_amp;
ASSERT(amp->refcnt != 0);
/*
* Get policy info
*
* Assume starting anon index of 0
*/
anon_index = seg_page(seg, addr);
policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
return (policy_info);
}
/*
* DISM only.
* Return locked pages over a given range.
*
* We will cache all DISM locked pages and save the pplist for the
* entire segment in the ppa field of the underlying DISM segment structure.
* Later, during a call to segspt_reclaim() we will use this ppa array
* to page_unlock() all of the pages and then we will free this ppa list.
*/
/*ARGSUSED*/
static int
segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
struct spt_data *sptd = sptseg->s_data;
pgcnt_t pg_idx, npages, tot_npages, npgs;
struct page **pplist, **pl, **ppa, *pp;
struct anon_map *amp;
spgcnt_t an_idx;
int ret = ENOTSUP;
uint_t pl_built = 0;
struct anon *ap;
struct vnode *vp;
u_offset_t off;
pgcnt_t claim_availrmem = 0;
uint_t szc;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* We want to lock/unlock the entire ISM segment. Therefore,
* we will be using the underlying sptseg and its base address
* and length for the caching arguments.
*/
ASSERT(sptseg);
ASSERT(sptd);
pg_idx = seg_page(seg, addr);
npages = btopr(len);
/*
* Check whether the request is larger than the number of
* pages covered by the amp.
*/
if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
*ppp = NULL;
return (ENOTSUP);
}
if (type == L_PAGEUNLOCK) {
ASSERT(sptd->spt_ppa != NULL);
seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
/*
* If someone is blocked while unmapping, we purge
* segment page cache and thus reclaim pplist synchronously
* without waiting for seg_pasync_thread. This speeds up
* unmapping in cases where munmap(2) is called, while
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
segspt_purge(seg);
}
return (0);
} else if (type == L_PAGERECLAIM) {
ASSERT(sptd->spt_ppa != NULL);
(void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_ppa, sptd->spt_prot);
return (0);
}
if (sptd->spt_flags & DISM_PPA_CHANGED) {
segspt_purge(seg);
/*
* For DISM the ppa array needs to be rebuilt, since
* the number of locked pages could have changed.
*/
*ppp = NULL;
return (ENOTSUP);
}
/*
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_prot);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa != NULL);
ASSERT(sptd->spt_ppa == pplist);
ppa = sptd->spt_ppa;
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
seg_pinactive(seg, seg->s_base,
sptd->spt_amp->size, ppa,
sptd->spt_prot, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
if ((szc = ppa[an_idx]->p_szc) != 0) {
npgs = page_get_pagecnt(szc);
an_idx = P2ROUNDUP(an_idx + 1, npgs);
} else {
an_idx++;
}
}
/*
* Since we cache the entire DISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
*ppp = &(sptd->spt_ppa[pg_idx]);
return (0);
}
/* The L_PAGELOCK case... */
mutex_enter(&sptd->spt_lock);
/*
* try to find pages in segment page cache with mutex
*/
pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_prot);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa != NULL);
ASSERT(sptd->spt_ppa == pplist);
ppa = sptd->spt_ppa;
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
mutex_exit(&sptd->spt_lock);
seg_pinactive(seg, seg->s_base,
sptd->spt_amp->size, ppa,
sptd->spt_prot, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
if ((szc = ppa[an_idx]->p_szc) != 0) {
npgs = page_get_pagecnt(szc);
an_idx = P2ROUNDUP(an_idx + 1, npgs);
} else {
an_idx++;
}
}
/*
* Since we cache the entire DISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
mutex_exit(&sptd->spt_lock);
*ppp = &(sptd->spt_ppa[pg_idx]);
return (0);
}
if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
SEGP_FAIL) {
mutex_exit(&sptd->spt_lock);
*ppp = NULL;
return (ENOTSUP);
}
/*
* No need to worry about protections because DISM pages are always rw.
*/
pl = pplist = NULL;
amp = sptd->spt_amp;
/*
* Do we need to build the ppa array?
*/
if (sptd->spt_ppa == NULL) {
pgcnt_t lpg_cnt = 0;
pl_built = 1;
tot_npages = btopr(sptd->spt_amp->size);
ASSERT(sptd->spt_pcachecnt == 0);
pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
pl = pplist;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
for (an_idx = 0; an_idx < tot_npages; ) {
ap = anon_get_ptr(amp->ahp, an_idx);
/*
* Cache only mlocked pages. For large pages,
* if one (constituent) page is mlocked,
* all pages for that large page
* are cached as well. This allows quick
* lookups in the ppa array.
*/
if ((ap != NULL) && (lpg_cnt != 0 ||
(sptd->spt_ppa_lckcnt[an_idx] != 0))) {
swap_xlate(ap, &vp, &off);
pp = page_lookup(vp, off, SE_SHARED);
ASSERT(pp != NULL);
if (lpg_cnt == 0) {
npgs = page_get_pagecnt(pp->p_szc);
if (!IS_P2ALIGNED(an_idx, npgs)) {
an_idx = P2ALIGN(an_idx, npgs);
page_unlock(pp);
continue;
}
}
if (++lpg_cnt == npgs)
lpg_cnt = 0;
/*
* availrmem is decremented only
* for unlocked pages
*/
if (sptd->spt_ppa_lckcnt[an_idx] == 0)
claim_availrmem++;
pplist[an_idx] = pp;
}
an_idx++;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
mutex_enter(&freemem_lock);
if (availrmem < tune.t_minarmem + claim_availrmem) {
mutex_exit(&freemem_lock);
ret = FC_MAKE_ERR(ENOMEM);
claim_availrmem = 0;
goto insert_fail;
} else {
availrmem -= claim_availrmem;
}
mutex_exit(&freemem_lock);
sptd->spt_ppa = pl;
} else {
/*
* We already have a valid ppa[].
*/
pl = sptd->spt_ppa;
}
ASSERT(pl != NULL);
ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH,
segspt_reclaim);
if (ret == SEGP_FAIL) {
/*
* seg_pinsert failed. We return
* ENOTSUP, so that the as_pagelock() code will
* then try the slower F_SOFTLOCK path.
*/
sptd->spt_ppa = NULL;
ret = ENOTSUP;
goto insert_fail;
}
/*
* In either case, we increment softlockcnt on the 'real' segment.
*/
sptd->spt_pcachecnt++;
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
ppa = sptd->spt_ppa;
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
mutex_exit(&sptd->spt_lock);
seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
pl, sptd->spt_prot, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
if ((szc = ppa[an_idx]->p_szc) != 0) {
npgs = page_get_pagecnt(szc);
an_idx = P2ROUNDUP(an_idx + 1, npgs);
} else {
an_idx++;
}
}
/*
* We can now drop the sptd->spt_lock since the ppa[]
* exists and we have incremented pcachecnt.
*/
mutex_exit(&sptd->spt_lock);
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. pg_idx.
*/
*ppp = &(sptd->spt_ppa[pg_idx]);
return (ret);
insert_fail:
/*
* We will only reach this code if we tried and failed.
*
* And we can drop the lock on the dummy seg, once we've failed
* to set up a new ppa[].
*/
mutex_exit(&sptd->spt_lock);
if (pl_built) {
mutex_enter(&freemem_lock);
availrmem += claim_availrmem;
mutex_exit(&freemem_lock);
/*
* We created pl and we need to destroy it.
*/
pplist = pl;
for (an_idx = 0; an_idx < tot_npages; an_idx++) {
if (pplist[an_idx] != NULL)
page_unlock(pplist[an_idx]);
}
kmem_free(pl, sizeof (page_t *) * tot_npages);
}
if (shmd->shm_softlockcnt <= 0) {
if (AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
}
mutex_exit(&seg->s_as->a_contents);
}
}
*ppp = NULL;
return (ret);
}
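/*
* Illustrative pairing (not part of this file): a caller doing I/O on
* a DISM segment brackets its access roughly as follows (in practice
* the as_pagelock()/as_pageunlock() machinery does this on its
* behalf):
*
*      page_t **ppa;
*
*      if (SEGOP_PAGELOCK(seg, addr, len, &ppa, L_PAGELOCK, rw) == 0) {
*              ... ppa points into sptd->spt_ppa at the slot for addr;
*              ... do the I/O against the locked pages, then:
*              (void) SEGOP_PAGELOCK(seg, addr, len, &ppa,
*                  L_PAGEUNLOCK, rw);
*      } else {
*              ... ENOTSUP: fall back to the slower F_SOFTLOCK path
*      }
*/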
/*
* return locked pages over a given range.
*
* We will cache the entire ISM segment and save the pplist for the
* entire segment in the ppa field of the underlying ISM segment structure.
* Later, during a call to segspt_reclaim() we will use this ppa array
* to page_unlock() all of the pages and then we will free this ppa list.
*/
/*ARGSUSED*/
static int
segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
struct spt_data *sptd = sptseg->s_data;
pgcnt_t np, page_index, npages;
caddr_t a, spt_base;
struct page **pplist, **pl, *pp;
struct anon_map *amp;
ulong_t anon_index;
int ret = ENOTSUP;
uint_t pl_built = 0;
struct anon *ap;
struct vnode *vp;
u_offset_t off;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* We want to lock/unlock the entire ISM segment. Therefore,
* we will be using the underlying sptseg and its base address
* and length for the caching arguments.
*/
ASSERT(sptseg);
ASSERT(sptd);
if (sptd->spt_flags & SHM_PAGEABLE) {
return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
}
page_index = seg_page(seg, addr);
npages = btopr(len);
/*
* Check whether the request is larger than the number of
* pages covered by the amp.
*/
if (page_index + npages > btopr(sptd->spt_amp->size)) {
*ppp = NULL;
return (ENOTSUP);
}
if (type == L_PAGEUNLOCK) {
ASSERT(sptd->spt_ppa != NULL);
seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
/*
* If someone is blocked while unmapping, we purge
* segment page cache and thus reclaim pplist synchronously
* without waiting for seg_pasync_thread. This speeds up
* unmapping in cases where munmap(2) is called, while
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
segspt_purge(seg);
}
return (0);
} else if (type == L_PAGERECLAIM) {
ASSERT(sptd->spt_ppa != NULL);
(void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_ppa, sptd->spt_prot);
return (0);
}
/*
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_prot);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa == pplist);
ASSERT(sptd->spt_ppa[page_index]);
/*
* Since we cache the entire ISM segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
*ppp = &(sptd->spt_ppa[page_index]);
return (0);
}
/* The L_PAGELOCK case... */
mutex_enter(&sptd->spt_lock);
/*
* try to find pages in segment page cache
*/
pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
sptd->spt_prot);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa == pplist);
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
mutex_exit(&sptd->spt_lock);
*ppp = &(sptd->spt_ppa[page_index]);
return (0);
}
if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
SEGP_FAIL) {
mutex_exit(&sptd->spt_lock);
*ppp = NULL;
return (ENOTSUP);
}
/*
* No need to worry about protections because ISM pages
* are always rw.
*/
pl = pplist = NULL;
/*
* Do we need to build the ppa array?
*/
if (sptd->spt_ppa == NULL) {
ASSERT(sptd->spt_ppa == pplist);
spt_base = sptseg->s_base;
pl_built = 1;
/*
* availrmem is decremented once during anon_swap_adjust()
* and is incremented during the anon_unresv(), which is
* called from shm_rm_amp() when the segment is destroyed.
*/
amp = sptd->spt_amp;
ASSERT(amp != NULL);
/* pcachecnt is protected by sptd->spt_lock */
ASSERT(sptd->spt_pcachecnt == 0);
pplist = kmem_zalloc(sizeof (page_t *)
* btopr(sptd->spt_amp->size), KM_SLEEP);
pl = pplist;
anon_index = seg_page(sptseg, spt_base);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
a += PAGESIZE, anon_index++, pplist++) {
ap = anon_get_ptr(amp->ahp, anon_index);
ASSERT(ap != NULL);
swap_xlate(ap, &vp, &off);
pp = page_lookup(vp, off, SE_SHARED);
ASSERT(pp != NULL);
*pplist = pp;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
if (a < (spt_base + sptd->spt_amp->size)) {
ret = ENOTSUP;
goto insert_fail;
}
sptd->spt_ppa = pl;
} else {
/*
* We already have a valid ppa[].
*/
pl = sptd->spt_ppa;
}
ASSERT(pl != NULL);
ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim);
if (ret == SEGP_FAIL) {
/*
* seg_pinsert failed. We return
* ENOTSUP, so that the as_pagelock() code will
* then try the slower F_SOFTLOCK path.
*/
if (pl_built) {
/*
* No one else has referenced the ppa[].
* We created it and we need to destroy it.
*/
sptd->spt_ppa = NULL;
}
ret = ENOTSUP;
goto insert_fail;
}
/*
* In either case, we increment softlockcnt on the 'real' segment.
*/
sptd->spt_pcachecnt++;
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
/*
* We can now drop the sptd->spt_lock since the ppa[]
* exists and we have incremented pcachecnt.
*/
mutex_exit(&sptd->spt_lock);
/*
* Since we cache the entire segment, we want to
* set ppp to point to the first slot that corresponds
* to the requested addr, i.e. page_index.
*/
*ppp = &(sptd->spt_ppa[page_index]);
return (ret);
insert_fail:
/*
* We will only reach this code if we tried and failed.
*
* And we can drop the lock on the dummy seg, once we've failed
* to set up a new ppa[].
*/
mutex_exit(&sptd->spt_lock);
if (pl_built) {
/*
* We created pl and we need to destroy it.
*/
pplist = pl;
np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
while (np) {
page_unlock(*pplist);
np--;
pplist++;
}
kmem_free(pl, sizeof (page_t *) *
btopr(sptd->spt_amp->size));
}
if (shmd->shm_softlockcnt <= 0) {
if (AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
}
mutex_exit(&seg->s_as->a_contents);
}
}
*ppp = NULL;
return (ret);
}
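/*
* Note (illustrative): unlike the DISM case above, the ISM ppa[] built
* here can assume every anon slot is populated and every page is
* locked - they were all created and page_pp_lock()ed in
* segspt_create() - so the build loop is a straight
* anon_get_ptr()/swap_xlate()/page_lookup() walk over the whole
* segment with no holes and no availrmem accounting.
*/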
/*
* purge any cached pages in the I/O page cache
*/
static void
segspt_purge(struct seg *seg)
{
seg_ppurge(seg);
}
static int
segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg;
struct spt_data *sptd;
pgcnt_t npages, i, free_availrmem = 0;
int done = 0;
#ifdef lint
addr = addr;
#endif
sptseg = shmd->shm_sptseg;
sptd = sptseg->s_data;
npages = (len >> PAGESHIFT);
ASSERT(npages);
ASSERT(sptd->spt_pcachecnt != 0);
ASSERT(sptd->spt_ppa == pplist);
ASSERT(npages == btopr(sptd->spt_amp->size));
/*
* Acquire the lock on the dummy seg and destroy the
* ppa array IF this is the last pcachecnt.
*/
mutex_enter(&sptd->spt_lock);
if (--sptd->spt_pcachecnt == 0) {
for (i = 0; i < npages; i++) {
if (pplist[i] == NULL) {
continue;
}
if (rw == S_WRITE) {
hat_setrefmod(pplist[i]);
} else {
hat_setref(pplist[i]);
}
if ((sptd->spt_flags & SHM_PAGEABLE) &&
(sptd->spt_ppa_lckcnt[i] == 0))
free_availrmem++;
page_unlock(pplist[i]);
}
if (sptd->spt_flags & SHM_PAGEABLE) {
mutex_enter(&freemem_lock);
availrmem += free_availrmem;
mutex_exit(&freemem_lock);
}
/*
* Since we want to cache/uncache the entire ISM segment,
* we will track the pplist in a segspt-specific field,
* ppa, that is initialized at the time we add an entry to
* the cache.
*/
ASSERT(sptd->spt_pcachecnt == 0);
kmem_free(pplist, sizeof (page_t *) * npages);
sptd->spt_ppa = NULL;
sptd->spt_flags &= ~DISM_PPA_CHANGED;
done = 1;
}
mutex_exit(&sptd->spt_lock);
/*
* Now decrement softlockcnt.
*/
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1);
if (shmd->shm_softlockcnt <= 0) {
if (AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
}
mutex_exit(&seg->s_as->a_contents);
}
}
return (done);
}
/*
* Do a F_SOFTUNLOCK call over the range requested.
* The range must have already been F_SOFTLOCK'ed.
*
* The calls to acquire and release the anon map lock mutex were
* removed in order to avoid a deadly embrace during a DR
* memory delete operation. (E.g. DR blocks while waiting for an
* exclusive lock on a page that is being used for kaio; the
* thread that will complete the kaio and call segspt_softunlock
* blocks on the anon map lock; another thread holding the anon
* map lock blocks on another page lock via the segspt_shmfault
* -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
*
* The appropriateness of the removal is based upon the following:
* 1. If we are holding a segment's reader lock and the page is held
* shared, then the corresponding element in anonmap which points to
* anon struct cannot change and there is no need to acquire the
* anonymous map lock.
* 2. Threads in segspt_softunlock have a reader lock on the segment
* and already have the shared page lock, so we are guaranteed that
* the anon map slot cannot change and therefore can call anon_get_ptr()
* without grabbing the anonymous map lock.
* 3. Threads that softlock a shared page break copy-on-write, even if
* it's a read. Thus cow faults can be ignored with respect to soft
* unlocking, since the breaking of cow means that the anon slot(s) will
* not be shared.
*/
static void
segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
size_t len, enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg;
struct spt_data *sptd;
page_t *pp;
caddr_t adr;
struct vnode *vp;
u_offset_t offset;
ulong_t anon_index;
struct anon_map *amp; /* XXX - for locknest */
struct anon *ap = NULL;
pgcnt_t npages;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
sptseg = shmd->shm_sptseg;
sptd = sptseg->s_data;
/*
* Some platforms assume that ISM mappings are HAT_LOAD_LOCK
* and therefore their pages are SE_SHARED locked
* for the entire life of the segment.
*/
if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
goto softlock_decrement;
}
/*
* Any thread is free to do a page_find and
* page_unlock() on the pages within this seg.
*
* We are already holding the as->a_lock on the user's
* real segment, but we need to hold the a_lock on the
* underlying dummy as. This is mostly to satisfy the
* underlying HAT layer.
*/
AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
amp = sptd->spt_amp;
ASSERT(amp != NULL);
anon_index = seg_page(sptseg, sptseg_addr);
for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
ap = anon_get_ptr(amp->ahp, anon_index++);
ASSERT(ap != NULL);
swap_xlate(ap, &vp, &offset);
/*
* Use page_find() instead of page_lookup() to
* find the page since we know that it has a
* "shared" lock.
*/
pp = page_find(vp, offset);
ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
if (pp == NULL) {
panic("segspt_softunlock: "
"addr %p, ap %p, vp %p, off %llx",
(void *)adr, (void *)ap, (void *)vp, offset);
/*NOTREACHED*/
}
if (rw == S_WRITE) {
hat_setrefmod(pp);
} else if (rw != S_OTHER) {
hat_setref(pp);
}
page_unlock(pp);
}
softlock_decrement:
npages = btopr(len);
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
if (shmd->shm_softlockcnt == 0) {
/*
* All SOFTLOCKS are gone. Wakeup any waiting
* unmappers so they can try again to unmap.
* Check for waiters first without the mutex
* held so we don't always grab the mutex on
* softunlocks.
*/
if (AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
}
mutex_exit(&seg->s_as->a_contents);
}
}
}
int
segspt_shmattach(struct seg *seg, caddr_t *argsp)
{
struct shm_data *shmd_arg = (struct shm_data *)argsp;
struct shm_data *shmd;
struct anon_map *shm_amp = shmd_arg->shm_amp;
struct spt_data *sptd;
int error = 0;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
if (shmd == NULL)
return (ENOMEM);
shmd->shm_sptas = shmd_arg->shm_sptas;
shmd->shm_amp = shm_amp;
shmd->shm_sptseg = shmd_arg->shm_sptseg;
(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
NULL, 0, seg->s_size);
seg->s_data = (void *)shmd;
seg->s_ops = &segspt_shmops;
seg->s_szc = shmd->shm_sptseg->s_szc;
sptd = shmd->shm_sptseg->s_data;
if (sptd->spt_flags & SHM_PAGEABLE) {
if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
KM_NOSLEEP)) == NULL) {
seg->s_data = (void *)NULL;
kmem_free(shmd, (sizeof (*shmd)));
return (ENOMEM);
}
shmd->shm_lckpgs = 0;
if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
seg->s_size, seg->s_szc)) != 0) {
kmem_free(shmd->shm_vpage,
btopr(shm_amp->size));
}
}
} else {
error = hat_share(seg->s_as->a_hat, seg->s_base,
shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
seg->s_size, seg->s_szc);
}
if (error) {
seg->s_szc = 0;
seg->s_data = (void *)NULL;
kmem_free(shmd, (sizeof (*shmd)));
} else {
ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
shm_amp->refcnt++;
ANON_LOCK_EXIT(&shm_amp->a_rwlock);
}
return (error);
}
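/*
* Illustrative sketch (not part of this file): how an attacher such as
* shmat() is expected to reach segspt_shmattach() via as_map(). The
* shm_data argument carries the dummy as/seg created earlier by
* sptcreate(); "attach_addr" and "size" are hypothetical placeholders.
*
*      struct shm_data shmd_arg;
*
*      shmd_arg.shm_sptas = sptseg->s_as;
*      shmd_arg.shm_amp = amp;
*      shmd_arg.shm_sptseg = sptseg;
*      error = as_map(curproc->p_as, attach_addr, size,
*          segspt_shmattach, &shmd_arg);
*/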
int
segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
int reclaim = 1;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
retry:
if (shmd->shm_softlockcnt > 0) {
if (reclaim == 1) {
segspt_purge(seg);
reclaim = 0;
goto retry;
}
return (EAGAIN);
}
if (ssize != seg->s_size) {
#ifdef DEBUG
cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
ssize, seg->s_size);
#endif
return (EINVAL);
}
(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
NULL, 0);
hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
seg_free(seg);
return (0);
}
void
segspt_shmfree(struct seg *seg)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct anon_map *shm_amp = shmd->shm_amp;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
MC_UNLOCK, NULL, 0);
/*
* Need to increment refcnt when attaching
* and decrement when detaching because of dup().
*/
ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
shm_amp->refcnt--;
ANON_LOCK_EXIT(&shm_amp->a_rwlock);
if (shmd->shm_vpage) { /* only for DISM */
kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
shmd->shm_vpage = NULL;
}
kmem_free(shmd, sizeof (*shmd));
}
/*ARGSUSED*/
int
segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* A shared page table is more than a shared mapping.
* An individual process sharing the page tables can't change prot
* because there is only one set of page tables.
* This will be allowed after private page tables are
* supported.
*/
/* need to return correct status error? */
return (0);
}
faultcode_t
segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
size_t len, enum fault_type type, enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
struct as *curspt = shmd->shm_sptas;
struct spt_data *sptd = sptseg->s_data;
pgcnt_t npages;
size_t share_sz, size;
caddr_t segspt_addr, shm_addr;
page_t **ppa;
int i;
ulong_t an_idx = 0;
int err = 0;
#ifdef lint
hat = hat;
#endif
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* Because of the way spt is implemented
* the realsize of the segment does not have to be
* equal to the segment size itself. The segment size is
* often in multiples of a page size larger than PAGESIZE.
* The realsize is rounded up to the nearest PAGESIZE
* based on what the user requested. This is a bit of
* ugliness that is historical but not easily fixed
* without re-designing the higher levels of ISM.
*/
ASSERT(addr >= seg->s_base);
if (((addr + len) - seg->s_base) > sptd->spt_realsize)
return (FC_NOMAP);
/*
* For all of the following cases except F_PROT, we need to
* make any necessary adjustments to addr and len
* and get all of the necessary page_t's into an array called ppa[].
*
* The code in shmat() forces base addr and len of ISM segment
* to be aligned to largest page size supported. Therefore,
* we are able to handle F_SOFTLOCK and F_INVAL calls in "large
* pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
* in large pagesize chunks, or else we will screw up the HAT
* layer by calling hat_memload_array() with differing page sizes
* over a given virtual range.
*/
share_sz = page_get_pagesize(sptseg->s_szc);
shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_sz);
npages = btopr(size);
/*
* Now we need to convert from addr in segshm to addr in segspt.
*/
an_idx = seg_page(seg, shm_addr);
segspt_addr = sptseg->s_base + ptob(an_idx);
ASSERT((segspt_addr + ptob(npages)) <=
(sptseg->s_base + sptd->spt_realsize));
ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
switch (type) {
case F_SOFTLOCK:
mutex_enter(&freemem_lock);
if (availrmem < tune.t_minarmem + npages) {
mutex_exit(&freemem_lock);
return (FC_MAKE_ERR(ENOMEM));
} else {
availrmem -= npages;
}
mutex_exit(&freemem_lock);
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
/*
* Fall through to the F_INVAL case to load up the hat layer
* entries with the HAT_LOAD_LOCK flag.
*/
/* FALLTHRU */
case F_INVAL:
if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
return (FC_NOMAP);
ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
if (err != 0) {
if (type == F_SOFTLOCK) {
mutex_enter(&freemem_lock);
availrmem += npages;
mutex_exit(&freemem_lock);
atomic_add_long((ulong_t *)(
&(shmd->shm_softlockcnt)), -npages);
}
goto dism_err;
}
AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
if (type == F_SOFTLOCK) {
/*
* Load up the translation keeping it
* locked and don't unlock the page.
*/
hat_memload_array(sptseg->s_as->a_hat, segspt_addr,
size, ppa, sptd->spt_prot,
HAT_LOAD_LOCK | HAT_LOAD_SHARE);
} else {
if (hat == seg->s_as->a_hat) {
/*
* Migrate pages marked for migration
*/
if (lgrp_optimizations())
page_migrate(seg, shm_addr, ppa,
npages);
/* CPU HAT */
hat_memload_array(sptseg->s_as->a_hat,
segspt_addr, size, ppa, sptd->spt_prot,
HAT_LOAD_SHARE);
} else {
/* XHAT. Pass real address */
hat_memload_array(hat, shm_addr,
size, ppa, sptd->spt_prot, HAT_LOAD_SHARE);
}
/*
* And now drop the SE_SHARED lock(s).
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
if (hat_share(seg->s_as->a_hat, shm_addr,
curspt->a_hat, segspt_addr, ptob(npages),
seg->s_szc) != 0) {
panic("hat_share err in DISM fault");
/* NOTREACHED */
}
}
AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
dism_err:
kmem_free(ppa, npages * sizeof (page_t *));
return (err);
case F_SOFTUNLOCK:
mutex_enter(&freemem_lock);
availrmem += npages;
mutex_exit(&freemem_lock);
/*
* This is a bit ugly, we pass in the real seg pointer,
* but the segspt_addr is the virtual address within the
* dummy seg.
*/
segspt_softunlock(seg, segspt_addr, size, rw);
return (0);
case F_PROT:
/*
* This takes care of the unusual case where a user
* allocates a stack in shared memory and a register
* window overflow is written to that stack page before
* it is otherwise modified.
*
* We can get away with this because ISM segments are
* always rw. Other than this unusual case, there
* should be no instances of protection violations.
*/
return (0);
default:
#ifdef DEBUG
panic("segspt_dismfault default type?");
#else
return (FC_NOMAP);
#endif
}
}
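/*
* Accounting sketch (illustrative): for DISM, every F_SOFTLOCK fault
* over npages removes npages from availrmem and adds npages to
* shmd->shm_softlockcnt; the matching F_SOFTUNLOCK returns npages to
* availrmem and segspt_softunlock() subtracts them from
* shm_softlockcnt, waking any waiting unmapper once the count reaches
* zero. The two paths must balance, since segspt_shmunmap() keeps
* returning EAGAIN while shm_softlockcnt > 0.
*/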
faultcode_t
segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
size_t len, enum fault_type type, enum seg_rw rw)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
struct as *curspt = shmd->shm_sptas;
struct spt_data *sptd = sptseg->s_data;
pgcnt_t npages;
size_t share_size, size;
caddr_t sptseg_addr, shm_addr;
page_t *pp, **ppa;
int i;
u_offset_t offset;
ulong_t anon_index = 0;
struct vnode *vp;
struct anon_map *amp; /* XXX - for locknest */
struct anon *ap = NULL;
anon_sync_obj_t cookie;
#ifdef lint
hat = hat;
#endif
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
if (sptd->spt_flags & SHM_PAGEABLE) {
return (segspt_dismfault(hat, seg, addr, len, type, rw));
}
/*
* Because of the way spt is implemented
* the realsize of the segment does not have to be
* equal to the segment size itself. The segment size is
* often in multiples of a page size larger than PAGESIZE.
* The realsize is rounded up to the nearest PAGESIZE
* based on what the user requested. This is a bit of
* ugliness that is historical but not easily fixed
* without re-designing the higher levels of ISM.
*/
ASSERT(addr >= seg->s_base);
if (((addr + len) - seg->s_base) > sptd->spt_realsize)
return (FC_NOMAP);
/*
* For all of the following cases except F_PROT, we need to
* make any necessary adjustments to addr and len
* and get all of the necessary page_t's into an array called ppa[].
*
* The code in shmat() forces base addr and len of ISM segment
* to be aligned to largest page size supported. Therefore,
* we are able to handle F_SOFTLOCK and F_INVAL calls in "large
* pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
* in large pagesize chunks, or else we will screw up the HAT
* layer by calling hat_memload_array() with differing page sizes
* over a given virtual range.
*/
share_size = page_get_pagesize(sptseg->s_szc);
shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_size);
npages = btopr(size);
/*
* Now we need to convert from addr in segshm to addr in segspt.
*/
anon_index = seg_page(seg, shm_addr);
sptseg_addr = sptseg->s_base + ptob(anon_index);
/*
* And now we may have to adjust npages downward if we have
* exceeded the realsize of the segment or initial anon
* allocations.
*/
if ((sptseg_addr + ptob(npages)) >
(sptseg->s_base + sptd->spt_realsize))
size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
npages = btopr(size);
ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
switch (type) {
case F_SOFTLOCK:
/*
* availrmem is decremented once during anon_swap_adjust()
* and is incremented during the anon_unresv(), which is
* called from shm_rm_amp() when the segment is destroyed.
*/
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
/*
* Some platforms assume that ISM pages are SE_SHARED
* locked for the entire life of the segment.
*/
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
return (0);
/*
* Fall through to the F_INVAL case to load up the hat layer
* entries with the HAT_LOAD_LOCK flag.
*/
/* FALLTHRU */
case F_INVAL:
if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
return (FC_NOMAP);
/*
* Some platforms that do NOT support DYNAMIC_ISM_UNMAP
* may still rely on this call to hat_share(). That
* would imply that those HATs can fault on a
* HAT_LOAD_LOCK translation, which would seem
* contradictory.
*/
if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
if (hat_share(seg->s_as->a_hat, seg->s_base,
curspt->a_hat, sptseg->s_base,
sptseg->s_size, sptseg->s_szc) != 0) {
panic("hat_share error in ISM fault");
/*NOTREACHED*/
}
return (0);
}
ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
/*
* I see no need to lock the real seg,
* here, because all of our work will be on the underlying
* dummy seg.
*
* sptseg_addr and npages now account for large pages.
*/
amp = sptd->spt_amp;
ASSERT(amp != NULL);
anon_index = seg_page(sptseg, sptseg_addr);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
for (i = 0; i < npages; i++) {
anon_array_enter(amp, anon_index, &cookie);
ap = anon_get_ptr(amp->ahp, anon_index++);
ASSERT(ap != NULL);
swap_xlate(ap, &vp, &offset);
anon_array_exit(&cookie);
pp = page_lookup(vp, offset, SE_SHARED);
ASSERT(pp != NULL);
ppa[i] = pp;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
ASSERT(i == npages);
/*
* We are already holding the as->a_lock on the user's
* real segment, but we need to hold the a_lock on the
* underlying dummy as. This is mostly to satisfy the
* underlying HAT layer.
*/
AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
if (type == F_SOFTLOCK) {
/*
* Load up the translation keeping it
* locked and don't unlock the page.
*/
hat_memload_array(sptseg->s_as->a_hat, sptseg_addr,
ptob(npages), ppa, sptd->spt_prot,
HAT_LOAD_LOCK | HAT_LOAD_SHARE);
} else {
if (hat == seg->s_as->a_hat) {
/*
* Migrate pages marked for migration.
*/
if (lgrp_optimizations())
page_migrate(seg, shm_addr, ppa,
npages);
/* CPU HAT */
hat_memload_array(sptseg->s_as->a_hat,
sptseg_addr, ptob(npages), ppa,
sptd->spt_prot, HAT_LOAD_SHARE);
} else {
/* XHAT. Pass real address */
hat_memload_array(hat, shm_addr,
ptob(npages), ppa, sptd->spt_prot,
HAT_LOAD_SHARE);
}
/*
* And now drop the SE_SHARED lock(s).
*/
for (i = 0; i < npages; i++)
page_unlock(ppa[i]);
}
AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
kmem_free(ppa, sizeof (page_t *) * npages);
return (0);
case F_SOFTUNLOCK:
/*
* This is a bit ugly, we pass in the real seg pointer,
* but the sptseg_addr is the virtual address within the
* dummy seg.
*/
segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
return (0);
case F_PROT:
/*
* This takes care of the unusual case where a user
* allocates a stack in shared memory and a register
* window overflow is written to that stack page before
* it is otherwise modified.
*
* We can get away with this because ISM segments are
* always rw. Other than this unusual case, there
* should be no instances of protection violations.
*/
return (0);
default:
#ifdef DEBUG
cmn_err(CE_WARN, "segspt_shmfault default type?");
#endif
return (FC_NOMAP);
}
}
/*ARGSUSED*/
static faultcode_t
segspt_shmfaulta(struct seg *seg, caddr_t addr)
{
return (0);
}
/*ARGSUSED*/
static int
segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
return (0);
}
/*ARGSUSED*/
static size_t
segspt_shmswapout(struct seg *seg)
{
return (0);
}
/*
* duplicate the shared page tables
*/
int
segspt_shmdup(struct seg *seg, struct seg *newseg)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct anon_map *amp = shmd->shm_amp;
struct shm_data *shmd_new;
struct seg *spt_seg = shmd->shm_sptseg;
struct spt_data *sptd = spt_seg->s_data;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
newseg->s_data = (void *)shmd_new;
shmd_new->shm_sptas = shmd->shm_sptas;
shmd_new->shm_amp = amp;
shmd_new->shm_sptseg = shmd->shm_sptseg;
newseg->s_ops = &segspt_shmops;
newseg->s_szc = seg->s_szc;
ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->refcnt++;
ANON_LOCK_EXIT(&amp->a_rwlock);
if (sptd->spt_flags & SHM_PAGEABLE) {
shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
shmd_new->shm_lckpgs = 0;
}
return (hat_share(newseg->s_as->a_hat, newseg->s_base,
shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc));
}
/*ARGSUSED*/
int
segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* ISM segment is always rw.
*/
return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
}
/*
* Return an array of locked large pages, for empty slots allocate
* private zero-filled anon pages.
*/
static int
spt_anon_getpages(
struct seg *sptseg,
caddr_t sptaddr,
size_t len,
page_t *ppa[])
{
struct spt_data *sptd = sptseg->s_data;
struct anon_map *amp = sptd->spt_amp;
enum seg_rw rw = sptd->spt_prot;
uint_t szc = sptseg->s_szc;
size_t pg_sz, share_sz = page_get_pagesize(szc);
pgcnt_t lp_npgs;
caddr_t lp_addr, e_sptaddr;
uint_t vpprot, ppa_szc = 0;
struct vpage *vpage = NULL;
ulong_t j, ppa_idx;
int err, ierr = 0;
pgcnt_t an_idx;
anon_sync_obj_t cookie;
ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
ASSERT(len != 0);
pg_sz = share_sz;
lp_npgs = btop(pg_sz);
lp_addr = sptaddr;
e_sptaddr = sptaddr + len;
an_idx = seg_page(sptseg, sptaddr);
ppa_idx = 0;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
/*CONSTCOND*/
while (1) {
for (; lp_addr < e_sptaddr;
an_idx += lp_npgs, lp_addr += pg_sz,
ppa_idx += lp_npgs) {
anon_array_enter(amp, an_idx, &cookie);
ppa_szc = (uint_t)-1;
ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
&ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred);
anon_array_exit(&cookie);
if (ierr != 0) {
if (ierr > 0) {
err = FC_MAKE_ERR(ierr);
goto lpgs_err;
}
break;
}
}
if (lp_addr == e_sptaddr) {
break;
}
ASSERT(lp_addr < e_sptaddr);
/*
* ierr == -1 means we failed to allocate a large page.
* so do a size down operation.
*
* ierr == -2 means some other process that privately shares
* pages with this process has allocated a larger page and we
* need to retry with larger pages. So do a size up
* operation. This relies on the fact that large pages are
* never partially shared i.e. if we share any constituent
* page of a large page with another process we must share the
* entire large page. Note this cannot happen for SOFTLOCK
* case, unless the current address (lp_addr) is at the beginning
* of the next page size boundary because the other process
* couldn't have relocated locked pages.
*/
ASSERT(ierr == -1 || ierr == -2);
if (segvn_anypgsz) {
ASSERT(ierr == -2 || szc != 0);
ASSERT(ierr == -1 || szc < sptseg->s_szc);
szc = (ierr == -1) ? szc - 1 : szc + 1;
} else {
/*
* For faults and segvn_anypgsz == 0
* we need to be careful not to loop forever
* if existing page is found with szc other
* than 0 or seg->s_szc. This could be due
* to page relocations on behalf of DR or
* more likely large page creation. For this
* case simply re-size to existing page's szc
* if returned by anon_map_getpages().
*/
if (ppa_szc == (uint_t)-1) {
szc = (ierr == -1) ? 0 : sptseg->s_szc;
} else {
ASSERT(ppa_szc <= sptseg->s_szc);
ASSERT(ierr == -2 || ppa_szc < szc);
ASSERT(ierr == -1 || ppa_szc > szc);
szc = ppa_szc;
}
}
pg_sz = page_get_pagesize(szc);
lp_npgs = btop(pg_sz);
ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
}
ANON_LOCK_EXIT(&amp->a_rwlock);
return (0);
lpgs_err:
ANON_LOCK_EXIT(&amp->a_rwlock);
for (j = 0; j < ppa_idx; j++)
page_unlock(ppa[j]);
return (err);
}
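/*
 * Lock npages DISM pages starting at anon_index, bumping the per-page
 * lock count (spt_ppa_lckcnt) and marking each page DISM_PG_LOCKED in
 * shm_vpage.  Bits are set in the caller's lockmap (when supplied) so
 * the caller can tell which pages this request actually locked.  Every
 * page in ppa[] is page_unlock()ed here whether or not it was locked;
 * if page_pp_lock() fails, the remaining pages are released and EAGAIN
 * is returned.
 */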
int
spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
page_t **ppa, ulong_t *lockmap, size_t pos)
{
struct shm_data *shmd = seg->s_data;
struct spt_data *sptd = shmd->shm_sptseg->s_data;
ulong_t i;
int kernel;
for (i = 0; i < npages; anon_index++, pos++, i++) {
if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
if (sptd->spt_ppa_lckcnt[anon_index] <
(ushort_t)DISM_LOCK_MAX) {
if (++sptd->spt_ppa_lckcnt[anon_index] ==
(ushort_t)DISM_LOCK_MAX) {
cmn_err(CE_WARN,
"DISM page lock limit "
"reached on DISM offset 0x%lx\n",
anon_index << PAGESHIFT);
}
kernel = (sptd->spt_ppa &&
sptd->spt_ppa[anon_index]) ? 1 : 0;
if (!page_pp_lock(ppa[i], 0, kernel)) {
/* unlock rest of the pages */
for (; i < npages; i++)
page_unlock(ppa[i]);
sptd->spt_ppa_lckcnt[anon_index]--;
return (EAGAIN);
}
shmd->shm_lckpgs++;
shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
if (lockmap != NULL)
BT_SET(lockmap, pos);
}
}
page_unlock(ppa[i]);
}
return (0);
}
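/*
 * lockop (MC_LOCK/MC_UNLOCK) handler for shared memory segments.  ISM
 * pages are locked at creation time, so this is a no-op unless the
 * segment is pageable (DISM).  For MC_LOCK the request is expanded to
 * the underlying large page size, the pages are obtained via
 * spt_anon_getpages(), and only the originally requested range is
 * locked; for MC_UNLOCK the locked pages in the range are looked up
 * and their lock counts dropped.  Any cached page array is marked
 * stale (DISM_PPA_CHANGED) in both cases.
 */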
/*ARGSUSED*/
static int
segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
int attr, int op, ulong_t *lockmap, size_t pos)
{
struct shm_data *shmd = seg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
struct spt_data *sptd = sptseg->s_data;
pgcnt_t npages, a_npages;
page_t **ppa;
pgcnt_t an_idx, a_an_idx, ppa_idx;
caddr_t spt_addr, a_addr; /* spt and aligned address */
size_t a_len; /* aligned len */
size_t share_sz;
ulong_t i;
int sts = 0;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
return (0);
}
addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
an_idx = seg_page(seg, addr);
npages = btopr(len);
if (an_idx + npages > btopr(shmd->shm_amp->size)) {
return (ENOMEM);
}
if (op == MC_LOCK) {
/*
 * Need to align the addr and size of the request if they are
 * not already aligned, so that we can always allocate large
 * page(s); however, we only lock what was requested in the
 * initial request.
 */
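/*
 * Purely illustrative example (assuming a 4M underlying page size
 * and a 4M-aligned segment base): a request for
 * [s_base + 0x3000, s_base + 0x5000) is expanded to
 * [s_base, s_base + 4M), the whole large page is brought in, and
 * only the pages covering the original 0x2000 bytes are locked;
 * the rest are unlocked again below.
 */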
share_sz = page_get_pagesize(sptseg->s_szc);
a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
share_sz);
a_npages = btop(a_len);
a_an_idx = seg_page(seg, a_addr);
spt_addr = sptseg->s_base + ptob(a_an_idx);
ppa_idx = an_idx - a_an_idx;
if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
/*
* Don't cache any new pages for IO and
* flush any cached pages.
*/
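/*
 * spt_ppa is the page array cached for the pagelock (I/O) path;
 * setting DISM_PPA_CHANGED marks that cache stale so it is not
 * reused while locks are being adjusted.
 */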
mutex_enter(&sptd->spt_lock);
if (sptd->spt_ppa != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
if (sts != 0) {
mutex_exit(&sptd->spt_lock);
kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
return (sts);
}
sts = spt_lockpages(seg, an_idx, npages,
&ppa[ppa_idx], lockmap, pos);
/*
 * Unlock the remaining pages for requests that were not
 * aligned to, or not a multiple of, the underlying large
 * page size.
 */
for (i = 0; i < ppa_idx; i++)
page_unlock(ppa[i]);
for (i = ppa_idx + npages; i < a_npages; i++)
page_unlock(ppa[i]);
if (sptd->spt_ppa != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
mutex_exit(&sptd->spt_lock);
kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
} else if (op == MC_UNLOCK) { /* unlock */
struct anon_map *amp;
struct anon *ap;
struct vnode *vp;
u_offset_t off;
struct page *pp;
int kernel;
anon_sync_obj_t cookie;
amp = sptd->spt_amp;
mutex_enter(&sptd->spt_lock);
if (shmd->shm_lckpgs == 0) {
mutex_exit(&sptd->spt_lock);
return (0);
}
/*
* Don't cache new IO pages.
*/
if (sptd->spt_ppa != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
for (i = 0; i < npages; i++, an_idx++) {
if (shmd->shm_vpage[an_idx] & DISM_PG_LOCKED) {
anon_array_enter(amp, an_idx, &cookie);
ap = anon_get_ptr(amp->ahp, an_idx);
ASSERT(ap);
ASSERT(sptd->spt_ppa_lckcnt[an_idx] > 0);
swap_xlate(ap, &vp, &off);
anon_array_exit(&cookie);
pp = page_lookup(vp, off, SE_SHARED);
ASSERT(pp);
/*
 * availrmem is decremented only for
 * pages which are not in the seg pcache;
 * for pages in the seg pcache, availrmem was
 * decremented in _dismpagelock() (if
 * they were not locked here).
 */
kernel = (sptd->spt_ppa &&
sptd->spt_ppa[an_idx]) ? 1 : 0;
page_pp_unlock(pp, 0, kernel);
page_unlock(pp);
shmd->shm_vpage[an_idx] &= ~DISM_PG_LOCKED;
sptd->spt_ppa_lckcnt[an_idx]--;
shmd->shm_lckpgs--;
}
}
ANON_LOCK_EXIT(&amp->a_rwlock);
if (sptd->spt_ppa != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
mutex_exit(&sptd->spt_lock);
}
return (sts);
}
/*ARGSUSED*/
int
segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
 * ISM segments are always rw; report the segment protections
 * for every page in the range.
 */
while (--pgno >= 0)
*protv++ = sptd->spt_prot;
return (0);
}
/*ARGSUSED*/
u_offset_t
segspt_shmgetoffset(struct seg *seg, caddr_t addr)
{
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/* Offset does not matter in ISM memory */
return ((u_offset_t)0);
}
/* ARGSUSED */
int
segspt_shmgettype(struct seg *seg, caddr_t addr)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
/*
 * The shared memory mapping is always MAP_SHARED; swap is only
 * reserved for DISM, so ISM mappings report MAP_NORESERVE.
 */
return (MAP_SHARED |
((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
}
/*ARGSUSED*/
int
segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
*vpp = sptd->spt_vp;
return (0);
}
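/*
 * madvise() support for shared memory segments.  MADV_FREE is honored
 * only for pageable (DISM) segments: the cached page array is
 * invalidated, the DISM seg pcache is purged and the anon pages in
 * the range are disclaimed.  The MADV_ACCESS_* hints set the lgroup
 * memory allocation policy for the range and mark any existing pages
 * for migration.
 */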
/*ARGSUSED*/
static int
segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
struct anon_map *amp;
pgcnt_t pg_idx;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
if (behav == MADV_FREE) {
if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
return (0);
amp = sptd->spt_amp;
pg_idx = seg_page(seg, addr);
mutex_enter(&sptd->spt_lock);
if (sptd->spt_ppa != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
mutex_exit(&sptd->spt_lock);
/*
* Purge all DISM cached pages
*/
seg_ppurge_seg(segspt_reclaim);
mutex_enter(&sptd->spt_lock);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
anon_disclaim(amp, pg_idx, len, ANON_PGLOOKUP_BLK);
ANON_LOCK_EXIT(&amp->a_rwlock);
mutex_exit(&sptd->spt_lock);
} else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
int already_set;
ulong_t anon_index;
lgrp_mem_policy_t policy;
caddr_t shm_addr;
size_t share_size;
size_t size;
struct seg *sptseg = shmd->shm_sptseg;
caddr_t sptseg_addr;
/*
 * Align the address and length to the page size of the
 * underlying segment.
 */
share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
share_size);
amp = shmd->shm_amp;
anon_index = seg_page(seg, shm_addr);
/*
* And now we may have to adjust size downward if we have
* exceeded the realsize of the segment or initial anon
* allocations.
*/
sptseg_addr = sptseg->s_base + ptob(anon_index);
if ((sptseg_addr + size) >
(sptseg->s_base + sptd->spt_realsize))
size = (sptseg->s_base + sptd->spt_realsize) -
sptseg_addr;
/*
* Set memory allocation policy for this segment
*/
policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
already_set = lgrp_shm_policy_set(policy, amp, anon_index,
NULL, 0, len);
/*
 * If the memory allocation policy is already set and is not
 * reapplicable, don't bother reapplying it.
 */
if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
return (0);
/*
 * Mark any existing pages in the given range for
 * migration, flushing the I/O page cache first and using
 * the underlying segment to calculate the anon index and
 * to obtain the anon map and vnode pointers.
 */
if (shmd->shm_softlockcnt > 0)
segspt_purge(seg);
page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
}
return (0);
}
/*ARGSUSED*/
void
segspt_shmdump(struct seg *seg)
{
/* no-op for ISM segment */
}
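/*
 * Changing the page size of an ISM/DISM segment is not supported.
 */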
/*ARGSUSED*/
static faultcode_t
segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
{
return (ENOTSUP);
}
/*
 * Get a memory ID for an address in a given segment.
 */
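/*
 * The memid for a shared memory page is the anon pointer plus the
 * page offset of the address.  If no anon slot exists yet for the
 * page, a zero-filled anon page is allocated against the spt segment
 * so that a stable memid can be returned.
 */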
static int
segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct anon *ap;
size_t anon_index;
struct anon_map *amp = shmd->shm_amp;
struct spt_data *sptd = shmd->shm_sptseg->s_data;
struct seg *sptseg = shmd->shm_sptseg;
anon_sync_obj_t cookie;
anon_index = seg_page(seg, addr);
if (addr > (seg->s_base + sptd->spt_realsize)) {
return (EFAULT);
}
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
anon_array_enter(amp, anon_index, &cookie);
ap = anon_get_ptr(amp->ahp, anon_index);
if (ap == NULL) {
struct page *pp;
caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
pp = anon_zero(sptseg, spt_addr, &ap, kcred);
if (pp == NULL) {
anon_array_exit(&cookie);
ANON_LOCK_EXIT(&amp->a_rwlock);
return (ENOMEM);
}
(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
page_unlock(pp);
}
anon_array_exit(&cookie);
ANON_LOCK_EXIT(&amp->a_rwlock);
memidp->val[0] = (uintptr_t)ap;
memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
return (0);
}
/*
* Get memory allocation policy info for specified address in given segment
*/
static lgrp_mem_policy_info_t *
segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
{
struct anon_map *amp;
ulong_t anon_index;
lgrp_mem_policy_info_t *policy_info;
struct shm_data *shm_data;
ASSERT(seg != NULL);
/*
 * Get the anon_map from the segshm data.
 *
 * Assume that no lock needs to be held on the anon_map, since
 * it should be protected by its reference count, which must be
 * nonzero for an existing segment. We do need to grab the
 * readers lock on the policy tree, though.
 */
shm_data = (struct shm_data *)seg->s_data;
if (shm_data == NULL)
return (NULL);
amp = shm_data->shm_amp;
ASSERT(amp->refcnt != 0);
/*
* Get policy info
*
* Assume starting anon index of 0
*/
anon_index = seg_page(seg, addr);
policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
return (policy_info);
}
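/*
 * No special segment capabilities are supported for ISM/DISM, so
 * always report 0.
 */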
/*ARGSUSED*/
static int
segspt_shmcapable(struct seg *seg, segcapability_t capability)
{
return (0);
}