/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - paged vnode.
*
* This file supplies vm support for the vnode operations that deal with pages.
*/
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/tnf_probe.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */
/*
* Find the largest contiguous block which contains `addr' for file offset
* `offset' in it while living within the file system block sizes (`vp_off'
* and `vp_len') and the address space limits for which no pages currently
* exist and which map to consecutive file offsets.
*/
page_t *
pvn_read_kluster(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, u_offset_t *offp, size_t *lenp, u_offset_t vp_off,
    size_t vp_len, int isra)
{
/*
* We only want to do klustering/read ahead if there
* are more than minfree pages currently available.
*/
if (pagesavail <= 0)
if (isra)
else
/* We calculate in pages instead of bytes due to 32-bit overflows */
/*
* Don't have enough free memory for the
* max request, try sizing down vp request.
*/
/*
* Still not enough memory, just settle for
* pagesavail which is at least 1.
*/
}
}
} else {
/*
* Scan back from front by incrementing "deltab" and
* comparing "off" with "vp_off + deltab" to avoid
* "signed" versus "unsigned" conversion problems.
*/
/*
* Call back to the segment driver to verify that
* the klustering/read ahead operation makes sense.
*/
break; /* page not eligible */
== NULL)
break; /* already have the page */
/*
* Add page to front of page list.
*/
}
/* scan forward from front */
/*
* Call back to the segment driver to verify that
* the klustering/read ahead operation makes sense.
*/
break; /* page not file extension */
== NULL)
break; /* already have page */
/*
* Add page to end of page list.
*/
}
/*
* If we ended up getting more than was actually
* requested, retract the returned length to only
* reflect what was requested. This might happen
* if we were allowed to kluster pages across a
* span of (say) 5 frags, and frag size is less
* than PAGESIZE. We need a whole number of
* pages to contain those frags, but the returned
* size should only allow the returned range to
* extend as far as the end of the frags.
*/
}
}
"pvn_read_kluster:seg %p addr %x isra %x",
return (plist);
}
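/*
 * Illustrative sketch (not part of the original source): roughly how a
 * file system's per-page getpage helper drives pvn_read_kluster().  The
 * function foo_getapage and the elided read step are hypothetical; only
 * the pvn_*() calls are real interfaces.
 */
static int
foo_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	u_offset_t io_off;
	size_t io_len;
	page_t *pp;
	int err = 0;

	/*
	 * Build a kluster of non-resident pages around "off".  For
	 * simplicity the allowed range here is just the page itself; a
	 * real file system would pass the bounds of the containing block.
	 */
	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
	    off, PAGESIZE, 0);
	if (pp == NULL)
		return (0);	/* the page showed up; caller will retry */

	/* ... read [io_off, io_off + io_len) into pp, set err on failure ... */

	if (err != 0) {
		/* Error: destroy the pages created for the kluster. */
		pvn_read_done(pp, B_ERROR);
		return (err);
	}

	/* Success: hand the pages back through the pl[] array. */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	return (0);
}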
/*
* Handle pages for this vnode on either side of the page "pp"
* which has been locked by the caller. This routine will also
* do klustering in the range [vp_off, vp_off + vp_len] up
* until a page which is not found. The offset and length
* of pages included are returned in "*offp" and "*lenp".
*
* Returns a list of dirty locked pages all ready to be
* written back.
*/
page_t *
pvn_write_kluster(struct vnode *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, u_offset_t vp_off, size_t vp_len, int flags)
{
/*
* Klustering should not be done if we are invalidating
* pages since we could destroy pages that belong to
* some other process if this is a swap vnode.
*/
return (pp);
}
else
/*
* Scan backwards looking for pages to kluster by incrementing
* "deltab" and comparing "off" with "vp_off + deltab" to
* avoid "signed" versus "unsigned" conversion problems.
*/
break; /* page not found */
break;
}
/* now scan forwards looking for pages to kluster */
break; /* page not found */
break;
}
return (dirty);
}
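/*
 * Illustrative sketch (not part of the original source): a typical
 * per-page write routine uses pvn_write_kluster() to pull in adjacent
 * dirty pages before starting one larger i/o.  foo_putapage is
 * hypothetical and the actual write is elided; see the completion
 * sketch after pvn_write_done() below.
 */
static int
foo_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	u_offset_t io_off;
	size_t io_len;
	page_t *plist;

	/*
	 * Kluster within the (hypothetical) MAXBSIZE-aligned block that
	 * contains the page handed to us.
	 */
	plist = pvn_write_kluster(vp, pp, &io_off, &io_len,
	    pp->p_offset & ~(u_offset_t)(MAXBSIZE - 1), MAXBSIZE, flags);

	/* ... start the write covering [io_off, io_off + io_len) ... */

	/*
	 * "plist" (the io-locked kluster) is eventually released by
	 * pvn_write_done(), either after biowait() for synchronous
	 * requests or from the async i/o completion path.
	 */
	if (offp != NULL)
		*offp = io_off;
	if (lenp != NULL)
		*lenp = io_len;
	return (0);
}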
/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
}
}
/*
* Entry point to be used by file system getpage subr's and
* other such routines which either want to unlock pages (B_ASYNC
* request) or destroy a list of pages if an error occurred.
*/
void
pvn_read_done(page_t *plist, int flags)
{
/*LINTED: constant in conditional context*/
} else {
(void) page_release(pp, 0);
}
}
}
/*
* Automagic pageout.
* When memory gets tight, start freeing pages popping out of the
* write queue.
*/
/*
* Routine to be called when page-out's complete.
* The caller, typically VOP_PUTPAGE, has to explicitly call this routine
* after waiting for i/o to complete (biowait) to free the list of
* pages associated with the buffer. These pages must be locked
* before i/o is initiated.
*
* If a write error occurs, the pages are marked as modified
* so the write will be re-tried later.
*/
void
pvn_write_done(page_t *plist, int flags)
{
int dfree = 0;
int pgrec = 0;
int pgout = 0;
int pgpgout = 0;
int anonpgout = 0;
int anonfree = 0;
int fspgout = 0;
int fsfree = 0;
int execpgout = 0;
int execfree = 0;
/*
* If we are about to start paging anyway, start freeing pages.
*/
}
/*
* Handle each page involved in the i/o operation.
*/
/* Kernel probe support */
/*
* Move page to the top of the v_page list.
* Skip pages modified during IO.
*/
}
}
/*
* Write operation failed. We don't want
* to destroy (or free) the page unless B_FORCE
* is set. We set the mod bit again and release
* all locks on the page so that it will get written
* back again later when things are hopefully
* better again.
* If B_INVAL and B_FORCE are set we really have
* to destroy the page.
*/
/*LINTED: constant in conditional context*/
} else {
}
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
*/
/*LINTED: constant in conditional context*/
/*
* Update statistics for pages being paged out
*/
anonpgout++;
} else {
execpgout++;
} else {
fspgout++;
}
}
}
pgout = 1;
pgpgout++;
"page_ws_out:pp %p", pp);
/*
* The page_struct_lock need not be acquired to
* examine "p_lckcnt" and "p_cowcnt" since we'll
* have an "exclusive" lock if the upgrade succeeds.
*/
if (page_tryupgrade(pp) &&
/*
* Check if someone has reclaimed the
* page. If ref and mod are not set, no
* one is using it so we can free it.
* The rest of the system is careful
* to use the NOSYNC flag to unload
* translations set up for i/o w/o
* affecting ref and mod bits.
*
* Obtain a copy of the real hardware
* mod bit using hat_pagesync(pp, HAT_DONTZERO)
* to avoid having to flush the cache.
*/
if (hat_page_is_mapped(pp)) {
/*
* Doesn't look like the page
* was modified so now we
* really have to unload the
* translations. Meanwhile
* another CPU could've
* modified it so we have to
* check again. We don't loop
* forever here because now
* the translations are gone
* and no one can get a new one
* since we have the "exclusive"
* lock on the page.
*/
(void) hat_pageunload(pp,
goto ck_refmod;
}
/*
* Update statistics for pages being
* freed
*/
anonfree++;
} else {
& VVMEXEC) {
execfree++;
} else {
fsfree++;
}
}
}
/*LINTED: constant in conditional ctx*/
dfree++;
} else {
pgrec++;
"page_ws_free:pp %p", pp);
}
} else {
/*
* Page is either `locked' in memory
* or was reclaimed and now has a
* "shared" lock, so release it.
*/
}
} else {
/*
* Neither B_FREE nor B_INVAL nor B_ERROR.
* Just release locks.
*/
}
}
/* Kernel probe */
}
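/*
 * Illustrative sketch (not part of the original source): the synchronous
 * completion pattern the comment above describes.  The buf handling is a
 * plausible outline, not taken from any particular file system; the
 * device/strategy step is elided.
 */
static int
foo_sync_write(vnode_t *vp, page_t *plist, u_offset_t io_off, size_t io_len,
    int flags)
{
	struct buf *bp;
	int err;

	/* Bind the io-locked page list to a buf for the write. */
	bp = pageio_setup(plist, io_len, vp, B_WRITE | flags);
	bp->b_offset = (offset_t)io_off;

	/* ... map io_off to a device block and call the strategy routine ... */

	err = biowait(bp);
	pageio_done(bp);

	/*
	 * pvn_write_done() unlocks (and possibly frees) every page on the
	 * list; B_ERROR makes it re-mark the pages dirty for a later retry.
	 */
	pvn_write_done(plist, (err ? B_ERROR : 0) | B_WRITE | flags);
	return (err);
}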
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
* B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
* must be destroyed, so don't try writing it out.
*
* The caller must ensure that the page is locked. Returns 1 if
* the page should be written back (the "iolock" is held in this
* case), or 0 if the page has been dealt with or has been
* unlocked.
*/
int
pvn_getdirty(page_t *pp, int flags)
{
/*
* If trying to invalidate or free a logically `locked' page,
* forget it. Don't need page_struct_lock to check p_lckcnt and
* p_cowcnt as the page is exclusively locked.
*/
return (0);
}
/*
* Now acquire the i/o lock so we can add it to the dirty
* list (if necessary). We avoid blocking on the i/o lock
* in the following cases:
*
* If B_DELWRI is set, which implies that this request is
* due to a klustering operation.
*
* If this is an async (B_ASYNC) operation and we are not doing
* invalidation (B_INVAL) [The current i/o or fsflush will ensure
* that the page is written out].
*/
if (!page_io_trylock(pp)) {
return (0);
}
} else {
}
/*
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
* Otherwise, we're just writing the page back so we
* need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
} else {
}
/*
* Don't need to add it to the
* list after all.
*/
/*LINTED: constant in conditional context*/
/*LINTED: constant in conditional context*/
} else {
/*
* This is an advisory path for the callers
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
* (2) we've not unloaded _all_ translations
*
* Let page_release() do the heavy-lifting.
*/
}
return (0);
}
/*
* Page is dirty, get it ready for the write back
* and add page to the dirty list.
*/
/*
* If we're going to free the page when we're done
* then we can let others try to use it starting now.
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
return (1);
}
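/*
 * Illustrative sketch (not part of the original source): how a putpage
 * routine walks an explicit offset range, letting pvn_getdirty() decide
 * per page whether anything needs to be written.  foo_putapage is the
 * hypothetical writer sketched earlier.
 */
static int
foo_putpage_range(vnode_t *vp, u_offset_t off, u_offset_t eoff, int flags,
    cred_t *cr)
{
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	int err = 0;

	for (io_off = off; io_off < eoff; io_off += io_len) {
		io_len = PAGESIZE;

		/*
		 * Invalidate/free requests need the exclusive lock; a
		 * plain write back only needs a shared lock and should
		 * not block.
		 */
		if (flags & (B_INVAL | B_FREE))
			pp = page_lookup(vp, io_off, SE_EXCL);
		else
			pp = page_lookup_nowait(vp, io_off, SE_SHARED);

		if (pp == NULL || pvn_getdirty(pp, flags) == 0)
			continue;

		/* pvn_getdirty() returned 1: pp is dirty and io-locked. */
		err = foo_putapage(vp, pp, &io_off, &io_len, flags, cr);
		if (err != 0)
			break;
	}
	return (err);
}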
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
return (0);
}
void
pvn_init()
{
if (pvn_vmodsort_disable == 0)
sizeof (page_t), 0, marker_constructor,
}
/*
* Process a vnode's page list for all pages whose offset is >= off.
* Pages are to either be free'd, invalidated, or written back to disk.
*
* An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
* is specified, otherwise they are "shared" locked.
*
* Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
*
* Special marker page_t's are inserted in the list in order
* to keep track of where we are in the list when locks are dropped.
*
* Note the list is circular and insertions can happen only at the
* head and tail of the list. The algorithm ensures visiting all pages
* on the list in the following way:
*
* Drop two marker pages at the end of the list.
*
* Move one marker page backwards towards the start of the list until
* it is at the list head, processing the pages passed along the way.
*
* Due to race conditions when the vphm mutex is dropped, additional pages
* can be added to either end of the list, so we'll continue to move
* the marker and process pages until it is up against the end marker.
*
* There is one special exit condition. If we are processing a VMODSORT
* vnode and only writing back modified pages, we can stop as soon as
* we run into an unmodified page. This makes fsync(3) operations fast.
*/
int
pvn_vplist_dirty(vnode_t *vp, u_offset_t off,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
    cred_t *), int flags, cred_t *cred)
{
int err = 0;
int error;
return (0);
/*
* Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
*
* Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
* from getting blocked while flushing pages to a dead NFS server.
*/
return (EAGAIN);
}
return (0);
}
/*
* Set up the marker pages used to walk the list
*/
/*
* Grab the lock protecting the vnode's page list;
* note that this lock is dropped at times in the loop.
*/
goto leave;
/*
* insert the markers and loop through the list of pages
*/
for (;;) {
/*
* If only doing an async write back, then we can
* stop as soon as we get to the start of the list.
*/
break;
/*
* otherwise stop when we've gone through all the pages
*/
break;
else
/*
* If just flushing dirty pages to disk and this vnode
* is using a sorted list of pages, we can stop processing
* as soon as we find an unmodified page, since all the
* modified pages are visited first.
*/
if (IS_VMODSORT(vp) &&
#ifdef DEBUG
/*
* For debug kernels examine what should be
* all the remaining clean pages, asserting
* that they are not modified.
*/
int attr;
do {
continue;
P_REF);
continue;
panic("v_pages list not all clean: "
"page_t*=%p vnode=%p off=%lx "
"attr=0x%x last clean page_t*=%p\n",
(void *)pp);
#endif
break;
/*
* Couldn't get io lock, wait until IO is done.
* Block only for sync IO since we don't want
* to block async IO.
*/
continue;
}
}
/*
* Skip this page if the offset is out of the desired range.
* Just move the marker and continue.
*/
continue;
}
/*
* If we are supposed to invalidate or free this
* page, then we need an exclusive lock.
*/
/*
* We must acquire the page lock for all synchronous
* operations (invalidate, free and write).
*/
/*
* If the page_lock() drops the mutex
* we must retry the loop.
*/
continue;
/*
* It's ok to move the marker page now.
*/
} else {
/*
* update the marker page for all remaining cases
*/
/*
* For write backs, if we can't lock the page, it's
* invalid or in the process of being destroyed. Skip
* it, assuming someone else is writing it.
*/
continue;
}
/*
* Successfully locked the page, now figure out what to
* do with it. Free pages are easily dealt with, invalidate
* if desired or just go on to the next page.
*/
continue;
}
/*
* Invalidate (destroy) the page.
*/
continue;
}
/*
* pvn_getdirty() figures out what to do with a dirty page.
* If the page is dirty, the putapage() routine will write it
* and will kluster any other adjacent dirty pages it can.
*
* pvn_getdirty() and `(*putapage)' unlock the page.
*/
if (!err)
}
}
/*
* Release the v_pages mutex and VVMLOCK, and wake up blocked threads.
*/
return (err);
}
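/*
 * Illustrative sketch (not part of the original source): a VOP_PUTPAGE
 * implementation commonly hands a len == 0 request ("everything from
 * off to EOF") to pvn_vplist_dirty() and handles explicit ranges itself.
 * foo_putpage, foo_putapage and foo_putpage_range are the hypothetical
 * routines sketched earlier.
 */
static int
foo_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	if (len == 0) {
		/* Walk the whole v_pages list, klustering as we go. */
		return (pvn_vplist_dirty(vp, (u_offset_t)off, foo_putapage,
		    flags, cr));
	}

	/* Otherwise only the pages in [off, off + len) are considered. */
	return (foo_putpage_range(vp, (u_offset_t)off,
	    (u_offset_t)off + len, flags, cr));
}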
/*
* Walk the vp->v_pages list, for every page call the callback function
* pointed to by page_check. If page_check returns non-zero, then mark the
* page as modified and if VMODSORT is set, move it to the end of v_pages
* list. Moving makes sense only if we have at least two pages - this also
* avoids having v_pages temporarily being NULL after calling page_vpsub()
* if there was just one page.
*/
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
int shuffle;
return;
}
for (;;) {
/*
* hat_setmod_only() in contrast to hat_setmod() does
* not shuffle the pages and does not grab the mutex
* page_vnode_mutex. Exactly what we need.
*/
if (shuffle) {
pp);
}
}
/* Stop if we have just processed the last page. */
break;
}
}
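/*
 * Illustrative sketch (not part of the original source): a page_check
 * callback for pvn_vplist_setdirty() only has to say, per page, whether
 * the software mod bit should be set.  The predicate below is a
 * hypothetical example that folds the hardware mod bit in.
 */
static int
foo_page_check(page_t *pp)
{
	/* Dirty in the HAT means the page needs to be visited by write back. */
	return (hat_ismod(pp));
}

/* A caller would then simply do: pvn_vplist_setdirty(vp, foo_page_check); */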
/*
* Zero out zbytes worth of data. Caller should be aware that this
* routine may enter back into the fs layer (xxx_getpage). Locks
* that the xxx_getpage routine may need should not be held while
* calling this.
*/
void
pvn_vptrunc(vnode_t *vp, u_offset_t vplen)
{
return;
/*
* zbytes may be zero but there still may be some portion of
* a page which needs clearing (since zbytes is a function
* of filesystem block size, not pagesize.)
*/
return;
/*
* We get the last page and handle the partial
* zeroing via kernel mappings. This will make the page
* dirty so that we know that when this page is written
* back, the zeroed information will go out with it. If
* the page is not currently in memory, then the kzero
* operation will cause it to be brought in. We use kzero
* instead of bzero so that if the page cannot be read in
* for any reason, the system will not panic. We need
* to zero out at least the zbytes given by the file system, but we
* might also have to do more to get the entire last page.
*/
panic("pvn_vptrunc zbytes");
}
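/*
 * Illustrative sketch (not part of the original source): a truncate path
 * typically updates the file size first and then calls pvn_vptrunc() with
 * the new length, honouring the locking caveat above.  foo_trunc is
 * hypothetical.
 */
static void
foo_trunc(vnode_t *vp, u_offset_t newsize)
{
	/*
	 * No locks that the file system's getpage routine needs may be
	 * held here, since pvn_vptrunc() can fault the last page in.
	 */
	pvn_vptrunc(vp, newsize);
}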
/*
* Handles common work of the VOP_GETPAGE routines by iterating page by page
* calling the getpage helper for each.
*/
int
{
int err;
/* ensure that we have enough space */
/*
* Loop one page at a time and let getapage function fill
* in the next page in array. We only allow one page to be
* returned at a time (except for the last page) so that we
* don't have any problems with duplicates and other such
* painful problems. This is a very simple minded algorithm,
* but it does the job correctly. We hope that the cost of a
* getapage call for a resident page that we might have been
* able to get from an earlier call isn't too high.
*/
/*
* Last time through - allow all of
* what's left of the pl[] array to be used.
*/
}
if (err) {
/*
* Release any pages we already got.
*/
}
break;
}
ppp++;
}
return (err);
}
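/*
 * Illustrative sketch (not part of the original source): a multi-page
 * VOP_GETPAGE request is usually just forwarded to pvn_getpages() with
 * the file system's single-page helper.  foo_getpage is hypothetical and
 * foo_getapage is the helper sketched earlier; its signature is assumed
 * to match the callback pvn_getpages() expects.
 */
static int
foo_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	if (len <= PAGESIZE) {
		/* A single page: call the per-page helper directly. */
		return (foo_getapage(vp, (u_offset_t)off, len, protp,
		    pl, plsz, seg, addr, rw, cr));
	}

	/* Let pvn_getpages() fill pl[] one page at a time. */
	return (pvn_getpages(foo_getapage, vp, (u_offset_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr));
}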
/*
* Initialize the page list array.
*/
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, u_offset_t off,
    size_t io_len, enum seg_rw rw)
{
/*
* Set up to load plsz worth
* starting at the needed page.
*/
/*
* Remove page from the i/o list,
* release the i/o and the page lock.
*/
}
return;
}
/*
* Initialize the page list array.
*/
do {
/*
* Now free the remaining pages that weren't
* loaded in the page list.
*/
}
}