/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* VM - generic vnode page mapping interfaces.
*
* Mechanism to provide temporary mappings to vnode pages.
* The typical use would be to copy/access file data.
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>
#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>
/*
* Needs to be enabled by each platform.
*/
int vpm_enable = 0;
#ifdef SEGKPM_SUPPORT
int vpm_cache_enable = 1;
long vpm_cache_percent = 12;
long vpm_cache_size;
int vpm_nfreelist = 0;
int vpmd_freemsk = 0;
#define VPM_S_PAD 64
union vpm_cpu {
struct {
int vcpu_free_ndx;
ulong_t vcpu_hits;
ulong_t vcpu_misses;
} vcpu;
char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu *vpmd_cpu;
#define vfree_ndx vcpu.vcpu_free_ndx
int vpm_cachemode = VPMCACHE_LRU;
#define PPMTX(pp) (&(pp)->p_ilock)
static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define VPMAPMTX(vpm) (&vpm->vpm_mtx)
#define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define VPMP(id) (&vpmd_vpmap[id - 1])
#define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1)
#ifdef DEBUG
struct vpm_debug {
int vpmd_steals;
int vpmd_contend;
int vpmd_prevpagelocked;
int vpmd_getpagefailed;
int vpmd_zerostart;
int vpmd_emptyfreelist;
int vpmd_nofreevpms;
} vpm_debug;
#define VPM_DEBUG(x) ((vpm_debug.x)++)
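/*
 * DEBUG-only fault injection: VPM_MTBF(v, f) evaluates false once every
 * (f + 1) calls, so the steal and contention recovery paths below get
 * exercised artificially on debug kernels.
 */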
int steals;
int steals_mtbf = 7;
int contend;
int contend_mtbf = 127;
#define VPM_MTBF(v, f) (((++(v)) & (f)) != (f))
#else /* DEBUG */
#define VPM_MTBF(v, f) (1)
#define VPM_DEBUG(x) /* nothing */
#endif
/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also to provide LRU (the default) behaviour for file pages.
 * The page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache (the "cachelist").  Once the
 * page_lookup()->page_reclaim() path is sped up, there should be no need
 * for this cache.  The system page cache (cachelist) should then effectively
 * serve the purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache.  Each page in the
 * cache is tracked by the structure vpmap_t, but unlike segmap there is no
 * hash table.  The page_t holds a reference to the vpmap_t when cached.
 * For a given vnode and offset, the page is found by means of a page_lookup()
 * operation.  Any page which has a mapping (i.e. when cached) will not be in
 * the system 'cachelist'.  Hence page_lookup() will not have to do a
 * page_reclaim().  That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
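/*
 * An illustrative sketch (not part of the original interfaces): how a cached
 * page_t resolves back to its vpmap_t.  p_vpmref holds a 1-based index into
 * the preallocated vpmd_vpmap array (zero means "not cached"), so VPMP() and
 * VPMID() simply convert between the index and the struct address:
 *
 *	uint_t id = pp->p_vpmref;
 *	struct vpmap *vpm = (id != 0) ? VPMP(id) : NULL;
 *	ASSERT(vpm == NULL || VPMID(vpm) == id);
 *
 * The association is only trusted while the page lock and the vpm mutex are
 * held; get_vpmap() below shows the full check.
 */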
void
vpm_init()
{
long npages;
struct vpmap *vpm;
struct vpmfree *vpmflp;
int i, ndx;
extern void prefetch_smap_w(void *);
if (!vpm_cache_enable) {
return;
}
/*
* Set the size of the cache.
*/
vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
if (vpm_cache_size < VPMAP_MINCACHE) {
vpm_cache_size = VPMAP_MINCACHE;
}
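	/*
	 * A worked example with hypothetical numbers: with 4K pages and
	 * physmem = 1048576 pages (4 GB), the default vpm_cache_percent of
	 * 12 yields mmu_ptob(125829), roughly 491 MB of cache.  A smaller
	 * result would have been bumped up to VPMAP_MINCACHE by the check
	 * above.
	 */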
/*
* Number of freelists.
*/
if (vpm_nfreelist == 0) {
vpm_nfreelist = max_ncpus;
} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : number of freelists "
		    "vpm_nfreelist %d out of range, using %d",
		    vpm_nfreelist, 2 * max_ncpus);
vpm_nfreelist = 2 * max_ncpus;
}
/*
* Round it up to the next power of 2
*/
if (vpm_nfreelist & (vpm_nfreelist - 1)) {
vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
}
vpmd_freemsk = vpm_nfreelist - 1;
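	/*
	 * For illustration (hypothetical tuning): if an administrator set
	 * vpm_nfreelist = 6, the round-up above gives 1 << highbit(6) = 8
	 * freelists and vpmd_freemsk = 7, so (vpm - vpmd_vpmap) & vpmd_freemsk
	 * spreads the vpmaps evenly across the 8 lists.
	 */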
/*
* Use a per cpu rotor index to spread the allocations evenly
* across the available vpm freelists.
*/
vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
ndx = 0;
for (i = 0; i < max_ncpus; i++) {
vpmd_cpu[i].vfree_ndx = ndx;
ndx = (ndx + 1) & vpmd_freemsk;
}
/*
* Allocate and initialize the freelist.
*/
vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
KM_SLEEP);
for (i = 0; i < vpm_nfreelist; i++) {
vpmflp = &vpmd_free[i];
/*
* Set up initial queue pointers. They will get flipped
* back and forth.
*/
vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
}
npages = mmu_btop(vpm_cache_size);
/*
* Allocate and initialize the vpmap structs.
*/
vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
struct vpmfree *vpmflp;
union vpm_freeq *releq;
struct vpmap *vpmapf;
		/*
		 * Use prefetch as we have to walk through a large number of
		 * these data structures.  We just use the smap's prefetch
		 * routine as it does the same thing.  This works fine for
		 * x64 (it needs to be modified when vpm is enabled on sparc).
		 */
prefetch_smap_w((void *)vpm);
vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
vpmflp = VPMAP2VMF(vpm);
releq = vpmflp->vpm_releq;
vpmapf = releq->vpmq_free;
if (vpmapf == NULL) {
releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
} else {
vpm->vpm_next = vpmapf;
vpm->vpm_prev = vpmapf->vpm_prev;
vpmapf->vpm_prev = vpm;
vpm->vpm_prev->vpm_next = vpm;
releq->vpmq_free = vpm->vpm_next;
}
/*
* Indicate that the vpmap is on the releq at start
*/
vpm->vpm_ndxflg = VPMRELEQ;
}
}
/*
 * Unhook vpm from the freelist if it is still on one.
 */
#define VPMAP_RMFREELIST(vpm) \
{ \
if (vpm->vpm_next != NULL) { \
union vpm_freeq *freeq; \
struct vpmfree *vpmflp; \
vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
mutex_enter(&freeq->vpmq_mtx); \
if (freeq->vpmq_free != vpm) { \
vpm->vpm_prev->vpm_next = vpm->vpm_next; \
vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
} else if (vpm == vpm->vpm_next) { \
freeq->vpmq_free = NULL; \
} else { \
freeq->vpmq_free = vpm->vpm_next; \
vpm->vpm_prev->vpm_next = vpm->vpm_next; \
vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
} \
mutex_exit(&freeq->vpmq_mtx); \
vpm->vpm_next = vpm->vpm_prev = NULL; \
} \
}
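/*
 * A note on the unlink cases above (a reading aid, not new behaviour): each
 * freelist is a circular doubly-linked list whose head is vpmq_free.  Three
 * situations are handled:
 *
 *	vpmq_free != vpm	vpm is not the head; just splice it out.
 *	vpm == vpm->vpm_next	vpm is the only element; empty the list.
 *	otherwise		vpm is the head; advance vpmq_free and
 *				splice vpm out.
 *
 * Setting vpm_next to NULL afterwards marks the vpmap as being off every
 * freelist.
 */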
static int
get_freelndx(int mode)
{
int ndx;
ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
switch (mode) {
case VPMCACHE_LRU:
default:
vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
break;
}
return (ndx);
}
/*
 * Find one vpmap structure from the free lists and use it for the newpage.
 * The previous page it cached is disassociated and released.  The page_t's
 * p_vpmref is cleared only while the vpm it points to is locked (or, on
 * AMD64, when the page is exclusively locked in page_unload(), because
 * p_vpmref is treated as a mapping there).
 *
 * The page's p_vpmref is set when the page is locked (at least SHARED
 * locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
struct vpmfree *vpmflp;
kmutex_t *vmtx;
struct vpmap *vpm, *first;
union vpm_freeq *allocq, *releq;
page_t *pp = NULL;
int end_ndx, page_locked = 0;
int free_ndx;
/*
* get the freelist bin index.
*/
free_ndx = get_freelndx(vpm_cachemode);
end_ndx = free_ndx;
vpmflp = &vpmd_free[free_ndx];
retry_queue:
allocq = vpmflp->vpm_allocq;
mutex_enter(&allocq->vpmq_mtx);
if ((vpm = allocq->vpmq_free) == NULL) {
skip_queue:
/*
* The alloc list is empty or this queue is being skipped;
* first see if the allocq toggled.
*/
if (vpmflp->vpm_allocq != allocq) {
/* queue changed */
mutex_exit(&allocq->vpmq_mtx);
goto retry_queue;
}
releq = vpmflp->vpm_releq;
if (!mutex_tryenter(&releq->vpmq_mtx)) {
/* cannot get releq; a free vpmap may be there now */
mutex_exit(&allocq->vpmq_mtx);
/*
* This loop could spin forever if this thread has
* higher priority than the thread that is holding
* releq->vpmq_mtx. In order to force the other thread
* to run, we'll lock/unlock the mutex which is safe
* since we just unlocked the allocq mutex.
*/
mutex_enter(&releq->vpmq_mtx);
mutex_exit(&releq->vpmq_mtx);
goto retry_queue;
}
if (releq->vpmq_free == NULL) {
VPM_DEBUG(vpmd_emptyfreelist);
/*
* This freelist is empty.
* This should not happen unless clients
* are failing to release the vpmap after
* accessing the data. Before resorting
* to sleeping, try the next list of the same color.
*/
free_ndx = (free_ndx + 1) & vpmd_freemsk;
if (free_ndx != end_ndx) {
mutex_exit(&releq->vpmq_mtx);
mutex_exit(&allocq->vpmq_mtx);
vpmflp = &vpmd_free[free_ndx];
goto retry_queue;
}
/*
* Tried all freelists.
* wait on this list and hope something gets freed.
*/
vpmflp->vpm_want++;
mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
cv_wait(&vpmflp->vpm_free_cv,
&vpmflp->vpm_freeq[0].vpmq_mtx);
vpmflp->vpm_want--;
mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
vpmflp = &vpmd_free[free_ndx];
VPM_DEBUG(vpmd_nofreevpms);
goto retry_queue;
} else {
/*
* Something on the rele queue; flip the alloc
* and rele queues and retry.
*/
vpmflp->vpm_allocq = releq;
vpmflp->vpm_releq = allocq;
mutex_exit(&allocq->vpmq_mtx);
mutex_exit(&releq->vpmq_mtx);
if (page_locked) {
delay(hz >> 2);
page_locked = 0;
}
goto retry_queue;
}
} else {
int gotnewvpm;
kmutex_t *pmtx;
uint_t vpmref;
		/*
		 * Fast path the case where we get the vpmap mutex
		 * on the first try.
		 */
first = vpm;
next_vpmap:
vmtx = VPMAPMTX(vpm);
if (!mutex_tryenter(vmtx)) {
/*
* Another thread is trying to reclaim this slot.
* Skip to the next queue or vpmap.
*/
if ((vpm = vpm->vpm_next) == first) {
goto skip_queue;
} else {
goto next_vpmap;
}
}
/*
* Assign this vpm to the newpage.
*/
pmtx = PPMTX(newpage);
gotnewvpm = 0;
mutex_enter(pmtx);
/*
* Check if some other thread already assigned a vpm to
* this page.
*/
if ((vpmref = newpage->p_vpmref) == 0) {
newpage->p_vpmref = VPMID(vpm);
gotnewvpm = 1;
} else {
VPM_DEBUG(vpmd_contend);
mutex_exit(vmtx);
}
mutex_exit(pmtx);
if (gotnewvpm) {
/*
* At this point, we've selected the vpm. Remove vpm
* from its freelist. If vpm is the first one in
* the freelist, update the head of the freelist.
*/
if (first == vpm) {
ASSERT(first == allocq->vpmq_free);
allocq->vpmq_free = vpm->vpm_next;
}
/*
* If the head of the freelist still points to vpm,
* then there are no more free vpmaps in that list.
*/
if (allocq->vpmq_free == vpm)
/*
* Took the last one
*/
allocq->vpmq_free = NULL;
else {
vpm->vpm_prev->vpm_next = vpm->vpm_next;
vpm->vpm_next->vpm_prev = vpm->vpm_prev;
}
mutex_exit(&allocq->vpmq_mtx);
vpm->vpm_prev = vpm->vpm_next = NULL;
/*
* Disassociate the previous page. On x64 systems
* p_vpmref is used as a mapping reference to the page.
*/
if ((pp = vpm->vpm_pp) != NULL &&
vpm->vpm_vp == pp->p_vnode &&
vpm->vpm_off == pp->p_offset) {
pmtx = PPMTX(pp);
if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page.  If not, someone else stole
					 * it, so just unlock it and leave.
					 */
mutex_enter(pmtx);
if (PP_ISFREE(pp) ||
vpm->vpm_vp != pp->p_vnode ||
vpm->vpm_off != pp->p_offset ||
pp->p_vpmref != VPMID(vpm)) {
mutex_exit(pmtx);
page_unlock(pp);
} else {
/*
* Release the page.
*/
pp->p_vpmref = 0;
mutex_exit(pmtx);
hat_kpm_mapout(pp, 0,
hat_kpm_page2va(pp, 1));
(void) page_release(pp, 1);
}
} else {
/*
* If the page cannot be locked, just
* clear the p_vpmref and go.
*/
mutex_enter(pmtx);
if (pp->p_vpmref == VPMID(vpm)) {
pp->p_vpmref = 0;
}
mutex_exit(pmtx);
VPM_DEBUG(vpmd_prevpagelocked);
}
}
/*
* Setup vpm to point to the new page.
*/
vpm->vpm_pp = newpage;
vpm->vpm_vp = newpage->p_vnode;
vpm->vpm_off = newpage->p_offset;
} else {
int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * The page already has a vpm assigned; just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one.  The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
mutex_exit(&allocq->vpmq_mtx);
vpm = VPMP(vpmref);
vmtx = VPMAPMTX(vpm);
mutex_enter(vmtx);
if ((steal && vpm->vpm_refcnt == 0) ||
vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen; clear the
				 * p_vpmref and retry.
				 */
pmtx = PPMTX(newpage);
mutex_enter(pmtx);
if (newpage->p_vpmref == vpmref) {
newpage->p_vpmref = 0;
}
mutex_exit(pmtx);
mutex_exit(vmtx);
VPM_DEBUG(vpmd_steals);
goto retry_queue;
} else if (vpm->vpm_refcnt == 0) {
/*
* Remove it from the free list if it
* exists there.
*/
VPMAP_RMFREELIST(vpm);
}
}
return (vpm);
}
}
static void
free_vpmap(struct vpmap *vpm)
{
struct vpmfree *vpmflp;
struct vpmap *vpmfreelist;
union vpm_freeq *releq;
ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
if (vpm->vpm_refcnt != 0) {
panic("free_vpmap");
/*NOTREACHED*/
}
vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue.  Note that vpm_releq and
	 * vpm_allocq could toggle before we get the lock.  This does not
	 * affect correctness as the two queues are only maintained to
	 * reduce lock pressure.
	 */
releq = vpmflp->vpm_releq;
if (releq == &vpmflp->vpm_freeq[0]) {
vpm->vpm_ndxflg = 0;
} else {
vpm->vpm_ndxflg = 1;
}
mutex_enter(&releq->vpmq_mtx);
vpmfreelist = releq->vpmq_free;
if (vpmfreelist == 0) {
int want;
releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
/*
* Both queue mutexes are held to set vpm_want;
* snapshot the value before dropping releq mutex.
* If vpm_want appears after the releq mutex is dropped,
* then the vpmap just freed is already gone.
*/
want = vpmflp->vpm_want;
mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex, as
		 * another thread may have already signaled.
		 */
if (want) {
mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
if (vpmflp->vpm_want)
cv_signal(&vpmflp->vpm_free_cv);
mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
}
} else {
vpm->vpm_next = vpmfreelist;
vpm->vpm_prev = vpmfreelist->vpm_prev;
vpmfreelist->vpm_prev = vpm;
vpm->vpm_prev->vpm_next = vpm;
mutex_exit(&releq->vpmq_mtx);
}
}
/*
* Get the vpmap for the page.
* The refcnt of this vpm is incremented.
*/
static struct vpmap *
get_vpmap(page_t *pp)
{
struct vpmap *vpm = NULL;
kmutex_t *vmtx;
kmutex_t *pmtx;
unsigned int refid;
ASSERT((pp != NULL) && PAGE_LOCKED(pp));
if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
vpm = VPMP(refid);
vmtx = VPMAPMTX(vpm);
mutex_enter(vmtx);
/*
* Since we have the page lock and the vpm mutex, the
* pp->p_vpmref cannot change.
*/
if (vpm->vpm_pp != pp) {
pmtx = PPMTX(pp);
/*
* Clear the p_vpmref as it is incorrect.
* This can happen if the page was stolen.
* On x64 this should not happen as p_vpmref
* is treated as a mapping on the page. So
* if the page is stolen, the mapping would have
* been cleared in page_unload().
*/
mutex_enter(pmtx);
if (pp->p_vpmref == refid)
pp->p_vpmref = 0;
mutex_exit(pmtx);
mutex_exit(vmtx);
vpm = NULL;
} else if (vpm->vpm_refcnt == 0) {
/*
* Got the vpm, remove it from the free
* list if it exists there.
*/
VPMAP_RMFREELIST(vpm);
}
}
if (vpm == NULL) {
/*
* get_free_vpmap() returns with the vpmap mutex held.
*/
vpm = get_free_vpmap(pp);
vmtx = VPMAPMTX(vpm);
vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
} else {
vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
}
vpm->vpm_refcnt++;
mutex_exit(vmtx);
return (vpm);
}
/* END --- vpm cache ---- */
/*
 * The vnode page mapping (vpm) interface routines.
 */
/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
struct vnode *vp,
u_offset_t baseoff,
size_t len,
vmap_t vml[],
int nseg,
int *newpage)
{
page_t *pp = NULL;
caddr_t base;
u_offset_t off = baseoff;
int i;
ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
for (i = 0; len > 0; len -= PAGESIZE, i++) {
struct vpmap *vpm;
if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
base = segkpm_create_va(off);
			/*
			 * The seg pointer passed in is just advisory.  Just
			 * pass segkmap for now, like segmap does with
			 * segmap_kpm enabled.
			 */
if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
segkmap, base)) == NULL) {
panic("segmap_pagecreate_vpm: "
"page_create failed");
/*NOTREACHED*/
}
if (newpage != NULL)
*newpage = 1;
page_io_unlock(pp);
}
/*
* Get the vpm for this page_t.
*/
if (vpm_cache_enable) {
vpm = get_vpmap(pp);
vml[i].vs_data = (void *)&vpm->vpm_pp;
} else {
vml[i].vs_data = (void *)pp;
pp->p_vpmref = 0;
}
vml[i].vs_addr = hat_kpm_mapin(pp, 0);
vml[i].vs_len = PAGESIZE;
off += PAGESIZE;
}
vml[i].vs_data = NULL;
vml[i].vs_addr = (caddr_t)NULL;
return (0);
}
/*
 * Returns vpm mappings of pages in the range [off, off + len], where
 * len is rounded up to the PAGESIZE boundary.  The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * nseg is the number of vmap_t entries in the array.
 *
 * Currently the maximum len allowed is MAXBSIZE; therefore this will
 * fetch/create one or two pages, depending on the PAGESIZE.
 *
 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
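/*
 * A minimal usage sketch (an illustration, not a caller in this file): a
 * client wanting direct kernel-mapped access to file data could do roughly
 * the following, where 'buf' is a hypothetical caller buffer and all error
 * handling beyond the return value is omitted.
 *
 *	vmap_t vml[MINVMAPS];
 *	int newpage = 0;
 *	int err;
 *
 *	err = vpm_map_pages(vp, off, MAXBSIZE, 1, vml, MINVMAPS,
 *	    &newpage, S_READ);
 *	if (err == 0) {
 *		bcopy(vml[0].vs_addr, buf, vml[0].vs_len);
 *		vpm_unmap_pages(vml, S_READ);
 *	}
 */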
int
vpm_map_pages(
struct vnode *vp,
u_offset_t off,
size_t len,
int fetchpage,
vmap_t *vml,
int nseg,
int *newpage,
enum seg_rw rw)
{
extern struct vnode *common_specvp();
u_offset_t baseoff;
uint_t prot;
caddr_t base;
page_t *pp, *pplist[MAXVMAPS];
struct vpmap *vpm;
int i, error = 0;
ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
baseoff = off & (offset_t)PAGEMASK;
vml[0].vs_data = NULL;
vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, let's restrict it to MAXBSIZE.  XXX - We could allow
	 * len longer than MAXBSIZE, but there should be a limit, which
	 * should be determined by how many pages VOP_GETPAGE() can fetch.
	 */
if (off + len > baseoff + MAXBSIZE) {
panic("vpm_map_pages bad len");
/*NOTREACHED*/
}
/*
* If this is a block device we have to be sure to use the
* "common" block device vnode for the mapping.
*/
if (vp->v_type == VBLK)
vp = common_specvp(vp);
/*
* round up len to a multiple of PAGESIZE.
*/
len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);
if (!fetchpage)
return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
pp = page_lookup(vp, baseoff, SE_SHARED);
		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do some
		 * (un)necessary tracking for sequential access.
		 */
if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
!= (P_MOD | P_REF))) {
if (pp != NULL) {
page_unlock(pp);
}
			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va().  We pass segkmap as the seg,
			 * as some file systems (UFS) check it.
			 */
base = segkpm_create_va(baseoff);
error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
len, segkmap, base, rw, CRED(), NULL);
if (error) {
VPM_DEBUG(vpmd_getpagefailed);
pplist[i] = NULL;
}
break;
} else {
pplist[i] = pp;
baseoff += PAGESIZE;
}
}
if (error) {
for (i = 0; pplist[i] != NULL; i++) {
page_unlock(pplist[i]);
pplist[i] = NULL;
}
vml[0].vs_addr = NULL;
vml[0].vs_data = NULL;
return (error);
}
/*
* Get the vpm's for pages.
*/
for (i = 0; pplist[i] != NULL; i++) {
if (vpm_cache_enable) {
vpm = get_vpmap(pplist[i]);
vml[i].vs_data = (void *)&(vpm->vpm_pp);
} else {
vml[i].vs_data = (void *)pplist[i];
pplist[i]->p_vpmref = 0;
}
vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
vml[i].vs_len = PAGESIZE;
}
vml[i].vs_data = NULL;
vml[i].vs_addr = (caddr_t)NULL;
return (0);
}
/*
* Release the vpm mappings on the pages and unlock them.
*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
int i;
struct vpmap *vpm;
kmutex_t *mtx;
page_t *pp;
for (i = 0; vml[i].vs_data != NULL; i++) {
ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
if (vpm_cache_enable) {
pp = *(((page_t **)vml[i].vs_data));
} else {
pp = (page_t *)vml[i].vs_data;
}
		/*
		 * Mark the page as modified or referenced, because vpm pages
		 * do not take the faults that would normally set these bits.
		 */
if (rw == S_WRITE) {
hat_setrefmod(pp);
} else {
ASSERT(rw == S_READ);
hat_setref(pp);
}
if (vpm_cache_enable) {
page_unlock(pp);
vpm = (struct vpmap *)((char *)vml[i].vs_data
- offsetof(struct vpmap, vpm_pp));
mtx = VPMAPMTX(vpm);
mutex_enter(mtx);
if (--vpm->vpm_refcnt == 0) {
free_vpmap(vpm);
}
mutex_exit(mtx);
} else {
hat_kpm_mapout(pp, 0, vml[i].vs_addr);
(void) page_release(pp, 1);
}
vml[i].vs_data = NULL;
vml[i].vs_addr = NULL;
}
}
/*
 * Given the vp, off and the uio structure, this routine will do the
 * copy (uiomove).  If the last page created is partially written,
 * the rest of the page is zeroed out.  It also zeroes the beginning of
 * the first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them; otherwise they will be created if
 * not already present in the page cache.
 */
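/*
 * A hedged usage sketch (illustrative only; 'uiop' and the offset arithmetic
 * are assumed to come from a filesystem's write path): copying user data into
 * the file's pages one MAXBSIZE-aligned chunk at a time might look roughly
 * like this when vpm_enable is set.
 *
 *	u_offset_t off = uiop->uio_loffset;
 *	size_t n = MIN(MAXBSIZE - (off & MAXBOFFSET), uiop->uio_resid);
 *	int error;
 *
 *	error = vpm_data_copy(vp, off, n, uiop, 1, NULL, 0, S_WRITE);
 */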
int
vpm_data_copy(struct vnode *vp,
u_offset_t off,
size_t len,
struct uio *uio,
int fetchpage,
int *newpage,
int zerostart,
enum seg_rw rw)
{
int error;
struct vmap vml[MINVMAPS];
enum uio_rw uiorw;
int npages = 0;
uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
/*
* 'off' will be the offset where the I/O starts.
* We get the pages starting at the (off & PAGEMASK)
* page boundary.
*/
error = vpm_map_pages(vp, off, (uint_t)len,
fetchpage, vml, MINVMAPS, &npages, rw);
if (newpage != NULL)
*newpage = npages;
if (!error) {
int i, pn, slen = len;
int pon = off & PAGEOFFSET;
/*
* Clear from the beginning of the page to start offset
* if requested.
*/
if (!fetchpage && zerostart) {
(void) kzero(vml[0].vs_addr, (uint_t)pon);
VPM_DEBUG(vpmd_zerostart);
}
for (i = 0; !error && slen > 0 &&
vml[i].vs_addr != NULL; i++) {
pn = (int)MIN(slen, (PAGESIZE - pon));
error = uiomove(vml[i].vs_addr + pon,
(long)pn, uiorw, uio);
slen -= pn;
pon = 0;
}
/*
* When new pages are created, zero out part of the
* page we did not copy to.
*/
if (!fetchpage && npages &&
uio->uio_loffset < roundup(off + len, PAGESIZE)) {
int nzero;
pon = (uio->uio_loffset & PAGEOFFSET);
nzero = PAGESIZE - pon;
i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
}
vpm_unmap_pages(vml, rw);
}
return (error);
}
/*
 * Called to flush pages of the given vnode covering the range
 * [off, off + len].
 */
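/*
 * For illustration (hypothetical caller): to push dirty pages of a range
 * asynchronously, or to hint that the range is no longer needed, a caller
 * could use
 *
 *	(void) vpm_sync_pages(vp, off, len, SM_ASYNC);
 *	(void) vpm_sync_pages(vp, off, len, SM_DONTNEED);
 *
 * Note that a flags value of SM_DONTNEED by itself skips the VOP_PUTPAGE()
 * call entirely and simply returns 0.
 */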
int
vpm_sync_pages(struct vnode *vp,
u_offset_t off,
size_t len,
uint_t flags)
{
extern struct vnode *common_specvp();
int bflags = 0;
int error = 0;
size_t psize = roundup(len, PAGESIZE);
/*
* If this is a block device we have to be sure to use the
* "common" block device vnode for the mapping.
*/
if (vp->v_type == VBLK)
vp = common_specvp(vp);
if ((flags & ~SM_DONTNEED) != 0) {
if (flags & SM_ASYNC)
bflags |= B_ASYNC;
if (flags & SM_INVAL)
bflags |= B_INVAL;
if (flags & SM_DESTROY)
bflags |= (B_INVAL|B_TRUNC);
if (flags & SM_FREE)
bflags |= B_FREE;
if (flags & SM_DONTNEED)
bflags |= B_DONTNEED;
error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
}
return (error);
}
#else /* SEGKPM_SUPPORT */
/* vpm stubs */
void
vpm_init()
{
}
/*ARGSUSED*/
int
vpm_pagecreate(
struct vnode *vp,
u_offset_t baseoff,
size_t len,
vmap_t vml[],
int nseg,
int *newpage)
{
return (0);
}
/*ARGSUSED*/
int
vpm_map_pages(
struct vnode *vp,
u_offset_t off,
size_t len,
int fetchpage,
vmap_t vml[],
int nseg,
int *newpage,
enum seg_rw rw)
{
return (0);
}
/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
u_offset_t off,
size_t len,
struct uio *uio,
int fetchpage,
int *newpage,
int zerostart,
enum seg_rw rw)
{
return (0);
}
/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
u_offset_t off,
size_t len,
uint_t flags)
{
return (0);
}
#endif /* SEGKPM_SUPPORT */