/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - paged vnode.
*
* This file supplies vm support for the vnode operations that deal with pages.
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;
uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
/* support for vmodsort for testing */
static struct kmem_cache *marker_cache = NULL;
/*
 * Find the largest contiguous run of pages which contains `addr' at file
 * offset `off', stays within the file system block limits (`vp_off' and
 * `vp_len') and the address space limits, covers only offsets for which no
 * pages currently exist, and maps to consecutive file offsets.
*/
page_t *
pvn_read_kluster(
struct vnode *vp,
u_offset_t off,
struct seg *seg,
caddr_t addr,
u_offset_t *offp, /* return values */
size_t *lenp, /* return values */
u_offset_t vp_off,
size_t vp_len,
int isra)
{
ssize_t deltaf, deltab;
page_t *pp;
page_t *plist = NULL;
spgcnt_t pagesavail;
u_offset_t vp_end;
ASSERT(off >= vp_off && off < vp_off + vp_len);
/*
* We only want to do klustering/read ahead if there
	 * are more than minfree pages currently available.
*/
pagesavail = freemem - minfree;
if (pagesavail <= 0)
if (isra)
return ((page_t *)NULL); /* ra case - give up */
else
pagesavail = 1; /* must return a page */
	/* We calculate in pages instead of bytes to avoid 32-bit overflow */
if (pagesavail < (spgcnt_t)btopr(vp_len)) {
/*
* Don't have enough free memory for the
* max request, try sizing down vp request.
*/
deltab = (ssize_t)(off - vp_off);
vp_len -= deltab;
vp_off += deltab;
if (pagesavail < btopr(vp_len)) {
/*
* Still not enough memory, just settle for
* pagesavail which is at least 1.
*/
vp_len = ptob(pagesavail);
}
}
vp_end = vp_off + vp_len;
ASSERT(off >= vp_off && off < vp_end);
if (isra && SEGOP_KLUSTER(seg, addr, 0))
return ((page_t *)NULL); /* segment driver says no */
if ((plist = page_create_va(vp, off,
PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
return ((page_t *)NULL);
if (vp_len <= PAGESIZE || pvn_nofodklust) {
*offp = off;
*lenp = MIN(vp_len, PAGESIZE);
} else {
/*
		 * Scan backwards from "off" by incrementing "deltab" and
* comparing "off" with "vp_off + deltab" to avoid
* "signed" versus "unsigned" conversion problems.
*/
for (deltab = PAGESIZE; off >= vp_off + deltab;
deltab += PAGESIZE) {
/*
* Call back to the segment driver to verify that
* the klustering/read ahead operation makes sense.
*/
if (SEGOP_KLUSTER(seg, addr, -deltab))
break; /* page not eligible */
if ((pp = page_create_va(vp, off - deltab,
PAGESIZE, PG_EXCL, seg, addr - deltab))
== NULL)
break; /* already have the page */
/*
* Add page to front of page list.
*/
page_add(&plist, pp);
}
deltab -= PAGESIZE;
		/* now scan forwards from "off" */
for (deltaf = PAGESIZE; off + deltaf < vp_end;
deltaf += PAGESIZE) {
/*
* Call back to the segment driver to verify that
* the klustering/read ahead operation makes sense.
*/
if (SEGOP_KLUSTER(seg, addr, deltaf))
break; /* page not file extension */
if ((pp = page_create_va(vp, off + deltaf,
PAGESIZE, PG_EXCL, seg, addr + deltaf))
== NULL)
break; /* already have page */
/*
* Add page to end of page list.
*/
page_add(&plist, pp);
plist = plist->p_next;
}
*offp = off = off - deltab;
*lenp = deltab + deltaf;
ASSERT(off >= vp_off);
/*
* If we ended up getting more than was actually
* requested, retract the returned length to only
* reflect what was requested. This might happen
* if we were allowed to kluster pages across a
* span of (say) 5 frags, and frag size is less
* than PAGESIZE. We need a whole number of
* pages to contain those frags, but the returned
* size should only allow the returned range to
* extend as far as the end of the frags.
*/
if ((vp_off + vp_len) < (off + *lenp)) {
ASSERT(vp_end > off);
*lenp = vp_end - off;
}
}
TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
"pvn_read_kluster:seg %p addr %x isra %x",
seg, addr, isra);
return (plist);
}
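/*
 * Illustrative sketch of how a file system getapage routine typically drives
 * pvn_read_kluster() and pvn_read_done().  The blkoff/blklen mapping and the
 * buffer setup shown here are hypothetical placeholders, not taken from any
 * particular file system; see a real getapage routine for the full pattern.
 *
 *	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
 *	    blkoff, blklen, isra);
 *	if (pp != NULL) {
 *		struct buf *bp = pageio_setup(pp, io_len, vp, B_READ);
 *		(set up bp->b_edev, bp->b_blkno, bp->b_un.b_addr, etc.)
 *		(void) bdev_strategy(bp);
 *		err = biowait(bp);
 *		pageio_done(bp);
 *		if (err != 0)
 *			pvn_read_done(pp, B_ERROR);
 *		else if (pl != NULL)
 *			pvn_plist_init(pp, pl, plsz, off, io_len, rw);
 *	}
 */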
/*
* Handle pages for this vnode on either side of the page "pp"
* which has been locked by the caller. This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len) until it
 * encounters a page that is not found or is not dirty.  The offset
 * and length of the pages included are returned in "*offp" and "*lenp".
*
* Returns a list of dirty locked pages all ready to be
* written back.
*/
page_t *
pvn_write_kluster(
struct vnode *vp,
page_t *pp,
u_offset_t *offp, /* return values */
size_t *lenp, /* return values */
u_offset_t vp_off,
size_t vp_len,
int flags)
{
u_offset_t off;
page_t *dirty;
size_t deltab, deltaf;
se_t se;
u_offset_t vp_end;
off = pp->p_offset;
/*
	 * Klustering should not be done if we are invalidating
* pages since we could destroy pages that belong to
* some other process if this is a swap vnode.
*/
if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
*offp = off;
*lenp = PAGESIZE;
return (pp);
}
if (flags & (B_FREE | B_INVAL))
se = SE_EXCL;
else
se = SE_SHARED;
dirty = pp;
/*
* Scan backwards looking for pages to kluster by incrementing
* "deltab" and comparing "off" with "vp_off + deltab" to
* avoid "signed" versus "unsigned" conversion problems.
*/
for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
pp = page_lookup_nowait(vp, off - deltab, se);
if (pp == NULL)
break; /* page not found */
if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
break;
page_add(&dirty, pp);
}
deltab -= PAGESIZE;
vp_end = vp_off + vp_len;
/* now scan forwards looking for pages to kluster */
for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
pp = page_lookup_nowait(vp, off + deltaf, se);
if (pp == NULL)
break; /* page not found */
if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
break;
page_add(&dirty, pp);
dirty = dirty->p_next;
}
*offp = off - deltab;
*lenp = deltab + deltaf;
return (dirty);
}
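/*
 * Illustrative sketch of how a file system putapage routine typically uses
 * pvn_write_kluster() together with pvn_write_done() below.  The blkoff/
 * blklen mapping and the buffer setup are hypothetical placeholders, not
 * taken from any particular file system.
 *
 *	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, blkoff, blklen,
 *	    flags);
 *	bp = pageio_setup(pp, io_len, vp, B_WRITE | flags);
 *	(set up bp->b_edev, bp->b_blkno, etc., then start the i/o)
 *	(void) bdev_strategy(bp);
 *	if ((flags & B_ASYNC) == 0) {
 *		err = biowait(bp);
 *		pageio_done(bp);
 *		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
 *	}
 */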
/*
* Generic entry point used to release the "shared/exclusive" lock
* and the "p_iolock" on pages after i/o is complete.
*/
void
pvn_io_done(page_t *plist)
{
page_t *pp;
while (plist != NULL) {
pp = plist;
page_sub(&plist, pp);
page_io_unlock(pp);
page_unlock(pp);
}
}
/*
* Entry point to be used by file system getpage subr's and
* other such routines which either want to unlock pages (B_ASYNC
* request) or destroy a list of pages if an error occurred.
*/
void
pvn_read_done(page_t *plist, int flags)
{
page_t *pp;
while (plist != NULL) {
pp = plist;
page_sub(&plist, pp);
page_io_unlock(pp);
if (flags & B_ERROR) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else {
(void) page_release(pp, 0);
}
}
}
/*
* Automagic pageout.
* When memory gets tight, start freeing pages popping out of the
* write queue.
*/
int write_free = 1;
pgcnt_t pages_before_pager = 200; /* LMXXX */
/*
 * Routine to be called when page-outs complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
* after waiting for i/o to complete (biowait) to free the list of
* pages associated with the buffer. These pages must be locked
* before i/o is initiated.
*
* If a write error occurs, the pages are marked as modified
* so the write will be re-tried later.
*/
void
pvn_write_done(page_t *plist, int flags)
{
int dfree = 0;
int pgrec = 0;
int pgout = 0;
int pgpgout = 0;
int anonpgout = 0;
int anonfree = 0;
int fspgout = 0;
int fsfree = 0;
int execpgout = 0;
int execfree = 0;
page_t *pp;
struct cpu *cpup;
struct vnode *vp = NULL; /* for probe */
uint_t ppattr;
kmutex_t *vphm = NULL;
ASSERT((flags & B_READ) == 0);
/*
* If we are about to start paging anyway, start freeing pages.
*/
if (write_free && freemem < lotsfree + pages_before_pager &&
(flags & B_ERROR) == 0) {
flags |= B_FREE;
}
/*
* Handle each page involved in the i/o operation.
*/
while (plist != NULL) {
pp = plist;
ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
page_sub(&plist, pp);
/* Kernel probe support */
if (vp == NULL)
vp = pp->p_vnode;
if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
/*
* Move page to the top of the v_page list.
* Skip pages modified during IO.
*/
vphm = page_vnode_mutex(vp);
mutex_enter(vphm);
if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
page_vpsub(&vp->v_pages, pp);
page_vpadd(&vp->v_pages, pp);
}
mutex_exit(vphm);
}
if (flags & B_ERROR) {
/*
			 * The write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set.  We set the mod bit again and release
			 * all locks on the page so that it will get
			 * written back later when things are hopefully
			 * better.
			 * If both B_INVAL and B_FORCE are set we really
			 * have to destroy the page.
*/
if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
page_io_unlock(pp);
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else {
hat_setmod_only(pp);
page_io_unlock(pp);
page_unlock(pp);
}
} else if (flags & B_INVAL) {
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
*/
page_io_unlock(pp);
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
/*
* Update statistics for pages being paged out
*/
if (pp->p_vnode) {
if (IS_SWAPFSVP(pp->p_vnode)) {
anonpgout++;
} else {
if (pp->p_vnode->v_flag & VVMEXEC) {
execpgout++;
} else {
fspgout++;
}
}
}
page_io_unlock(pp);
pgout = 1;
pgpgout++;
TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
"page_ws_out:pp %p", pp);
/*
* The page_struct_lock need not be acquired to
* examine "p_lckcnt" and "p_cowcnt" since we'll
* have an "exclusive" lock if the upgrade succeeds.
*/
if (page_tryupgrade(pp) &&
pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
/*
* Check if someone has reclaimed the
* page. If ref and mod are not set, no
* one is using it so we can free it.
* The rest of the system is careful
* to use the NOSYNC flag to unload
* translations set up for i/o w/o
* affecting ref and mod bits.
*
* Obtain a copy of the real hardware
* mod bit using hat_pagesync(pp, HAT_DONTZERO)
* to avoid having to flush the cache.
*/
ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
HAT_SYNC_STOPON_MOD);
ck_refmod:
if (!(ppattr & (P_REF | P_MOD))) {
if (hat_page_is_mapped(pp)) {
/*
* Doesn't look like the page
* was modified so now we
* really have to unload the
* translations. Meanwhile
* another CPU could've
* modified it so we have to
* check again. We don't loop
* forever here because now
* the translations are gone
* and no one can get a new one
* since we have the "exclusive"
* lock on the page.
*/
(void) hat_pageunload(pp,
HAT_FORCE_PGUNLOAD);
ppattr = hat_page_getattr(pp,
P_REF | P_MOD);
goto ck_refmod;
}
/*
* Update statistics for pages being
* freed
*/
if (pp->p_vnode) {
if (IS_SWAPFSVP(pp->p_vnode)) {
anonfree++;
} else {
if (pp->p_vnode->v_flag
& VVMEXEC) {
execfree++;
} else {
fsfree++;
}
}
}
/*LINTED: constant in conditional ctx*/
VN_DISPOSE(pp, B_FREE,
(flags & B_DONTNEED), kcred);
dfree++;
} else {
page_unlock(pp);
pgrec++;
TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
"page_ws_free:pp %p", pp);
}
} else {
/*
* Page is either `locked' in memory
* or was reclaimed and now has a
* "shared" lock, so release it.
*/
page_unlock(pp);
}
} else {
/*
* Neither B_FREE nor B_INVAL nor B_ERROR.
* Just release locks.
*/
page_io_unlock(pp);
page_unlock(pp);
}
}
CPU_STATS_ENTER_K();
cpup = CPU; /* get cpup now that CPU cannot change */
CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
CPU_STATS_EXIT_K();
/* Kernel probe */
TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
tnf_opaque, vnode, vp,
tnf_ulong, pages_pageout, pgpgout,
tnf_ulong, pages_freed, dfree,
tnf_ulong, pages_reclaimed, pgrec);
}
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
* B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
*
* The caller must ensure that the page is locked. Returns 1, if
* the page should be written back (the "iolock" is held in this
* case), or 0 if the page has been dealt with or has been
* unlocked.
*/
int
pvn_getdirty(page_t *pp, int flags)
{
ASSERT((flags & (B_INVAL | B_FREE)) ?
PAGE_EXCL(pp) : PAGE_SHARED(pp));
ASSERT(PP_ISFREE(pp) == 0);
/*
* If trying to invalidate or free a logically `locked' page,
* forget it. Don't need page_struct_lock to check p_lckcnt and
* p_cowcnt as the page is exclusively locked.
*/
if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
(pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
page_unlock(pp);
return (0);
}
/*
* Now acquire the i/o lock so we can add it to the dirty
* list (if necessary). We avoid blocking on the i/o lock
* in the following cases:
*
* If B_DELWRI is set, which implies that this request is
	 * due to a klustering operation.
*
* If this is an async (B_ASYNC) operation and we are not doing
* invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 * that the page is written out].
*/
if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
if (!page_io_trylock(pp)) {
page_unlock(pp);
return (0);
}
} else {
page_io_lock(pp);
}
/*
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
* Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bits to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
}
if (!hat_ismod(pp) || (flags & B_TRUNC)) {
/*
* Don't need to add it to the
* list after all.
*/
page_io_unlock(pp);
if (flags & B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
} else {
/*
			 * This is the advisory path for callers
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
* (2) we've not unloaded _all_ translations
*
* Let page_release() do the heavy-lifting.
*/
(void) page_release(pp, 1);
}
return (0);
}
/*
	 * The page is dirty; get it ready for the write back
	 * and add it to the dirty list.
*/
hat_clrrefmod(pp);
/*
* If we're going to free the page when we're done
* then we can let others try to use it starting now.
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
if (flags & B_FREE)
page_downgrade(pp);
TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
return (1);
}
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
page_t *mark = buf;
bzero(mark, sizeof (page_t));
mark->p_hash = PVN_VPLIST_HASH_TAG;
return (0);
}
void
pvn_init()
{
if (pvn_vmodsort_disable == 0)
pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
marker_cache = kmem_cache_create("marker_cache",
sizeof (page_t), 0, marker_constructor,
NULL, NULL, NULL, NULL, 0);
}
/*
* Process a vnode's page list for all pages whose offset is >= off.
* Pages are to either be free'd, invalidated, or written back to disk.
*
* An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
* is specified, otherwise they are "shared" locked.
*
* Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
*
* Special marker page_t's are inserted in the list in order
* to keep track of where we are in the list when locks are dropped.
*
* Note the list is circular and insertions can happen only at the
* head and tail of the list. The algorithm ensures visiting all pages
* on the list in the following way:
*
* Drop two marker pages at the end of the list.
*
* Move one marker page backwards towards the start of the list until
* it is at the list head, processing the pages passed along the way.
*
* Due to race conditions when the vphm mutex is dropped, additional pages
* can be added to either end of the list, so we'll continue to move
* the marker and process pages until it is up against the end marker.
*
* There is one special exit condition. If we are processing a VMODSORT
* vnode and only writing back modified pages, we can stop as soon as
* we run into an unmodified page. This makes fsync(3) operations fast.
*/
int
pvn_vplist_dirty(
vnode_t *vp,
u_offset_t off,
int (*putapage)(vnode_t *, page_t *, u_offset_t *,
size_t *, int, cred_t *),
int flags,
cred_t *cred)
{
page_t *pp;
page_t *mark; /* marker page that moves toward head */
page_t *end; /* marker page at end of list */
int err = 0;
int error;
kmutex_t *vphm;
se_t se;
page_t **where_to_move;
ASSERT(vp->v_type != VCHR);
if (vp->v_pages == NULL)
return (0);
/*
* Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
*
* Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
* from getting blocked while flushing pages to a dead NFS server.
*/
mutex_enter(&vp->v_lock);
if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
mutex_exit(&vp->v_lock);
return (EAGAIN);
}
while (vp->v_flag & VVMLOCK)
cv_wait(&vp->v_cv, &vp->v_lock);
if (vp->v_pages == NULL) {
mutex_exit(&vp->v_lock);
return (0);
}
vp->v_flag |= VVMLOCK;
mutex_exit(&vp->v_lock);
/*
* Set up the marker pages used to walk the list
*/
end = kmem_cache_alloc(marker_cache, KM_SLEEP);
end->p_vnode = vp;
end->p_offset = (u_offset_t)-2;
mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
mark->p_vnode = vp;
mark->p_offset = (u_offset_t)-1;
/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
*/
vphm = page_vnode_mutex(vp);
mutex_enter(vphm);
if (vp->v_pages == NULL)
goto leave;
/*
* insert the markers and loop through the list of pages
*/
page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
page_vpadd(&mark->p_vpnext, end);
for (;;) {
/*
* If only doing an async write back, then we can
		 * stop as soon as we get to the start of the list.
*/
if (flags == B_ASYNC && vp->v_pages == mark)
break;
/*
* otherwise stop when we've gone through all the pages
*/
if (mark->p_vpprev == end)
break;
pp = mark->p_vpprev;
if (vp->v_pages == pp)
where_to_move = &vp->v_pages;
else
where_to_move = &pp->p_vpprev->p_vpnext;
ASSERT(pp->p_vnode == vp);
/*
* If just flushing dirty pages to disk and this vnode
* is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
*/
if (IS_VMODSORT(vp) &&
!(flags & (B_INVAL | B_FREE | B_TRUNC))) {
if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
/*
* For debug kernels examine what should be
* all the remaining clean pages, asserting
* that they are not modified.
*/
page_t *chk = pp;
int attr;
page_vpsub(&vp->v_pages, mark);
page_vpadd(where_to_move, mark);
do {
chk = chk->p_vpprev;
ASSERT(chk != end);
if (chk == mark)
continue;
attr = hat_page_getattr(chk, P_MOD |
P_REF);
if ((attr & P_MOD) == 0)
continue;
panic("v_pages list not all clean: "
"page_t*=%p vnode=%p off=%lx "
"attr=0x%x last clean page_t*=%p\n",
(void *)chk, (void *)chk->p_vnode,
(long)chk->p_offset, attr,
(void *)pp);
} while (chk != vp->v_pages);
#endif
break;
} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
/*
* Couldn't get io lock, wait until IO is done.
* Block only for sync IO since we don't want
* to block async IO.
*/
mutex_exit(vphm);
page_io_wait(pp);
mutex_enter(vphm);
continue;
}
}
/*
* Skip this page if the offset is out of the desired range.
* Just move the marker and continue.
*/
if (pp->p_offset < off) {
page_vpsub(&vp->v_pages, mark);
page_vpadd(where_to_move, mark);
continue;
}
/*
* If we are supposed to invalidate or free this
* page, then we need an exclusive lock.
*/
se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
/*
* We must acquire the page lock for all synchronous
* operations (invalidate, free and write).
*/
if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
/*
* If the page_lock() drops the mutex
* we must retry the loop.
*/
if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
continue;
/*
* It's ok to move the marker page now.
*/
page_vpsub(&vp->v_pages, mark);
page_vpadd(where_to_move, mark);
} else {
/*
* update the marker page for all remaining cases
*/
page_vpsub(&vp->v_pages, mark);
page_vpadd(where_to_move, mark);
/*
			 * For write backs, if we can't lock the page, it's
* invalid or in the process of being destroyed. Skip
* it, assuming someone else is writing it.
*/
if (!page_trylock(pp, se))
continue;
}
ASSERT(pp->p_vnode == vp);
/*
* Successfully locked the page, now figure out what to
* do with it. Free pages are easily dealt with, invalidate
* if desired or just go on to the next page.
*/
if (PP_ISFREE(pp)) {
if ((flags & B_INVAL) == 0) {
page_unlock(pp);
continue;
}
/*
* Invalidate (destroy) the page.
*/
mutex_exit(vphm);
page_destroy_free(pp);
mutex_enter(vphm);
continue;
}
/*
		 * pvn_getdirty() figures out what to do with a dirty page.
* If the page is dirty, the putapage() routine will write it
* and will kluster any other adjacent dirty pages it can.
*
* pvn_getdirty() and `(*putapage)' unlock the page.
*/
mutex_exit(vphm);
if (pvn_getdirty(pp, flags)) {
error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
if (!err)
err = error;
}
mutex_enter(vphm);
}
page_vpsub(&vp->v_pages, mark);
page_vpsub(&vp->v_pages, end);
leave:
/*
	 * Release the v_pages mutex, clear VVMLOCK and wake up blocked
	 * threads.
*/
mutex_exit(vphm);
kmem_cache_free(marker_cache, mark);
kmem_cache_free(marker_cache, end);
mutex_enter(&vp->v_lock);
vp->v_flag &= ~VVMLOCK;
cv_broadcast(&vp->v_cv);
mutex_exit(&vp->v_lock);
return (err);
}
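/*
 * Illustrative sketch of a VOP_PUTPAGE implementation built on
 * pvn_vplist_dirty(); the foo_* names and the range-handling branch are
 * hypothetical placeholders, not taken from any particular file system.
 *
 *	static int
 *	foo_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
 *	    cred_t *cr, caller_context_t *ct)
 *	{
 *		if (len == 0) {
 *			(flush every page of the vnode at or beyond off)
 *			return (pvn_vplist_dirty(vp, (u_offset_t)off,
 *			    foo_putapage, flags, cr));
 *		}
 *		(otherwise walk [off, off + len) page by page, handing
 *		each dirty page that pvn_getdirty() approves to
 *		foo_putapage())
 *	}
 */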
/*
 * Walk the vp->v_pages list and, for every page, call the callback function
 * pointed to by page_check.  If page_check returns non-zero, then mark the
 * page as modified and, if VMODSORT is set, move it to the end of the
 * v_pages list.  Moving makes sense only if we have at least two pages;
 * this also avoids having v_pages temporarily become NULL after calling
 * page_vpsub() if there was just one page.
*/
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
page_t *pp, *next, *end;
kmutex_t *vphm;
int shuffle;
vphm = page_vnode_mutex(vp);
mutex_enter(vphm);
if (vp->v_pages == NULL) {
mutex_exit(vphm);
return;
}
end = vp->v_pages->p_vpprev;
shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
pp = vp->v_pages;
for (;;) {
next = pp->p_vpnext;
if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
/*
			 * hat_setmod_only(), in contrast to hat_setmod(),
			 * does not shuffle the pages and does not grab the
			 * page_vnode_mutex.  Exactly what we need here.
*/
hat_setmod_only(pp);
if (shuffle) {
page_vpsub(&vp->v_pages, pp);
ASSERT(vp->v_pages != NULL);
page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
pp);
}
}
/* Stop if we have just processed the last page. */
if (pp == end)
break;
pp = next;
}
mutex_exit(vphm);
}
/*
* Zero out zbytes worth of data. Caller should be aware that this
* routine may enter back into the fs layer (xxx_getpage). Locks
* that the xxx_getpage routine may need should not be held while
* calling this.
*/
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
caddr_t addr;
ASSERT(vp->v_type != VCHR);
if (vp->v_pages == NULL)
return;
/*
* zbytes may be zero but there still may be some portion of
* a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize).
*/
if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
return;
/*
* We get the last page and handle the partial
* zeroing via kernel mappings. This will make the page
* dirty so that we know that when this page is written
* back, the zeroed information will go out with it. If
* the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
* instead of bzero so that if the page cannot be read in
* for any reason, the system will not panic. We need
	 * to zero out at least the fs-given zbytes, but we
* might also have to do more to get the entire last page.
*/
if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
panic("pvn_vptrunc zbytes");
addr = segmap_getmapflt(segkmap, vp, vplen,
MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
(void) kzero(addr + (vplen & MAXBOFFSET),
MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}
/*
 * Handles the common work of the VOP_GETPAGE routines by iterating page by
 * page and calling the getpage helper for each.
*/
int
pvn_getpages(
int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
struct vnode *vp,
u_offset_t off,
size_t len,
uint_t *protp,
page_t *pl[],
size_t plsz,
struct seg *seg,
caddr_t addr,
enum seg_rw rw,
struct cred *cred)
{
page_t **ppp;
u_offset_t o, eoff;
size_t sz, xlen;
int err;
/* ensure that we have enough space */
ASSERT(pl == NULL || plsz >= len);
/*
	 * Loop one page at a time and let the getapage function fill
	 * in the next page in the array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple-minded algorithm,
	 * but it does the job correctly.  We hope that a getapage
	 * call for a resident page that we might have been able to get
	 * from an earlier call doesn't cost too much.
*/
ppp = pl;
sz = (pl != NULL) ? PAGESIZE : 0;
eoff = off + len;
xlen = len;
for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
xlen -= PAGESIZE) {
if (o + PAGESIZE >= eoff && pl != NULL) {
/*
			 * Last time through - allow all of
* what's left of the pl[] array to be used.
*/
sz = plsz - (o - off);
}
err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
rw, cred);
if (err) {
/*
* Release any pages we already got.
*/
if (o > off && pl != NULL) {
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
(void) page_release(*ppp, 1);
}
break;
}
if (pl != NULL)
ppp++;
}
return (err);
}
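/*
 * Illustrative sketch of how a VOP_GETPAGE routine typically dispatches to
 * pvn_getpages() for multi-page requests; the foo_* names are hypothetical
 * placeholders, and single-page requests usually go straight to the
 * getapage helper.
 *
 *	if (len <= PAGESIZE)
 *		err = foo_getapage(vp, (u_offset_t)off, len, protp, pl,
 *		    plsz, seg, addr, rw, cr);
 *	else
 *		err = pvn_getpages(foo_getapage, vp, (u_offset_t)off, len,
 *		    protp, pl, plsz, seg, addr, rw, cr);
 */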
/*
* Initialize the page list array.
*/
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
u_offset_t off, size_t io_len, enum seg_rw rw)
{
ssize_t sz;
page_t *ppcur, **ppp;
/*
* Set up to load plsz worth
* starting at the needed page.
*/
while (pp != NULL && pp->p_offset != off) {
/*
* Remove page from the i/o list,
* release the i/o and the page lock.
*/
ppcur = pp;
page_sub(&pp, ppcur);
page_io_unlock(ppcur);
(void) page_release(ppcur, 1);
}
if (pp == NULL) {
pl[0] = NULL;
return;
}
sz = plsz;
/*
* Initialize the page list array.
*/
ppp = pl;
do {
ppcur = pp;
*ppp++ = ppcur;
page_sub(&pp, ppcur);
page_io_unlock(ppcur);
if (rw != S_CREATE)
page_downgrade(ppcur);
sz -= PAGESIZE;
} while (sz > 0 && pp != NULL);
*ppp = NULL; /* terminate list */
/*
* Now free the remaining pages that weren't
* loaded in the page list.
*/
while (pp != NULL) {
ppcur = pp;
page_sub(&pp, ppcur);
page_io_unlock(ppcur);
(void) page_release(ppcur, 1);
}
}