fsflush.c revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
/*
 * Switches consulted by the flush loop on every pass: the page-flush step
 * is guarded by `if (dopageflush)` and the inode/attribute sync is skipped
 * when `doiflush` is zero.
 */
int doiflush = 1; /* non-zero to turn inode flushing on */
int dopageflush = 1; /* non-zero to turn page flushing on */
/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot. To revert to the old
 * behavior, set fsflush_iflush_delay to 0. We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover. We have, however, widened this window. Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck. This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
 */
/*
 * Seconds after boot before the inode flush is allowed to run; the flush
 * loop compares (lbolt64 / hz) against this value and ignores it when the
 * system was booted with RB_SINGLE.
 */
int fsflush_iflush_delay = 60;
/* The daemon sleeps on this condition variable in cv_wait() between passes. */
kcondvar_t fsflush_cv;
static kmutex_t fsflush_lock; /* just for the cv_wait */
/* Initialised with a count of 0 by the daemon's sema_init() call. */
ksema_t fsflush_sema; /* to serialize with reboot */
/*
 * Some statistics for fsflush_do_pages.
 */
/*
 * Counters describing page-scanning passes. Two instances exist in this
 * file: fsf_recent (latest pass) and fsf_total (running sum).
 */
typedef struct {
ulong_t fsf_scan; /* number of pages scanned (stored from the pass's nscan) */
ulong_t fsf_examined; /* number of page_t's actually examined, can */
/* be less than fsf_scan due to large pages */
ulong_t fsf_locked; /* pages we actually page_lock()ed */
ulong_t fsf_modified; /* number of modified pages found */
ulong_t fsf_coalesce; /* number of page coalesces done */
ulong_t fsf_time; /* nanoseconds of run time (gethrtime() delta) */
ulong_t fsf_releases; /* number of page_release() done */
} fsf_stat_t;
fsf_stat_t fsf_recent; /* counts for most recent duty cycle */
fsf_stat_t fsf_total; /* total of counts */
/* Incremented once per pass; fsf_total is zeroed when it reaches 1000000. */
ulong_t fsf_cycles; /* number of runs reflected in fsf_total */
/*
 * Data used to determine when we can coalesce consecutive free pages
 * into larger pages.
 */
/*
 * Capacity of the per-size-code tables below.
 * NOTE(review): no check that fsf_npgsz <= MAX_PAGESIZES is visible in this
 * listing -- confirm page_num_pagesizes() can never exceed it.
 */
#define MAX_PAGESIZES 32
/* Number of page sizes, set from page_num_pagesizes() by the daemon. */
static ulong_t fsf_npgsz;
/* fsf_pgcnt[i]: page_get_pagesize(i + 1) / page_get_pagesize(i). */
static pgcnt_t fsf_pgcnt[MAX_PAGESIZES];
/* fsf_mask[i]: page_get_pagecnt(i + 1) - 1, used as an alignment mask on page numbers. */
static pgcnt_t fsf_mask[MAX_PAGESIZES];
/*
 * Scan page_t's and issue I/O's for modified pages.
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
/*
 * Page-scanning pass. Walks page_t's with page_next_scan_large(), tries to
 * promote runs of free pages with page_promote_size(), issues an
 * asynchronous VOP_PUTPAGE() for modified pages, calls page_release() on
 * pages that are not mapped, and finally records the pass in fsf_recent and
 * fsf_total.
 *
 * NOTE(review): this listing appears to have lost the function-name line,
 * the braces, and a number of statements (several `if` bodies, the counter
 * increments, and the rest of the hat_pagesync() call). Confirm the control
 * flow against the upstream file before changing any code here.
 */
static void
vnode_t *vp;
ulong_t pcount;
hrtime_t timer = gethrtime();
ulong_t releases = 0;
ulong_t nexamined = 0;
ulong_t nlocked = 0;
ulong_t nmodified = 0;
ulong_t ncoalesce = 0;
int mod;
u_offset_t offset;
uint_t szc;
page_t *coal_page = NULL; /* 1st page in group to coalesce */
uint_t coal_szc = 0; /* size code, coal_page->p_szc */
uint_t coal_cnt = 0; /* count of pages seen */
/* The statics below keep the scan target and scan position across calls. */
static ulong_t nscan = 0;
static pgcnt_t last_total_pages = 0;
static void *pp_cookie = NULL;
static page_t *pp;
* Check to see if total_pages has changed.
if (total_pages != last_total_pages) {
last_total_pages = total_pages;
/* Per-pass scan target: total_pages * t_fsflushr / v_autoup. */
nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
* On first time through initialize the cookie used for page_t scans
if (pp_cookie == NULL)
pp = page_next_scan_init(&pp_cookie);
pcount = 0;
/* pcount is passed by address to page_next_scan_large(), which presumably advances it. */
while (pcount <= nscan) {
* move to the next page, skipping over large pages
* and issuing prefetches.
pp = page_next_scan_large(pp, &pcount, &pp_cookie);
prefetch_page_r((void *)pp);
* Do a bunch of dirty tests (ie. no locking) to determine
* if we can quickly skip this page. These tests are repeated
* after acquiring the page lock.
if (PP_ISSWAP(pp)) {
coal_page = NULL;
* skip free pages too, but try coalescing them into larger
* pagesizes
if (PP_ISFREE(pp)) {
* skip pages with a file system identity or that
* are already maximum size
szc = pp->p_szc;
/* fsf_npgsz - 1 is the highest size code, so nothing larger exists to promote to. */
if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
coal_page = NULL;
* If not in a coalescing candidate page or the size
* codes are different, start a new candidate.
if (coal_page == NULL || coal_szc != szc) {
* page must be properly aligned
/* fsf_mask[szc] is page_get_pagecnt(szc + 1) - 1; non-zero low bits mean misaligned. */
if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
coal_page = NULL;
coal_page = pp;
coal_szc = szc;
coal_cnt = 1;
* acceptable to add this to existing candidate page
/* fsf_pgcnt[coal_szc] is how many pages of this size make one of the next size. */
if (coal_cnt < fsf_pgcnt[coal_szc])
* We've got enough pages to coalesce, so do it.
* After promoting, we clear coal_page, so it will
* take another pass to promote this to an even
* larger page.
(void) page_promote_size(coal_page, coal_szc);
coal_page = NULL;
} else {
coal_page = NULL;
if (PP_ISKAS(pp) ||
pp->p_lckcnt != 0 ||
pp->p_cowcnt != 0)
* Reject pages that can't be "exclusively" locked.
/* page_trylock() is a non-blocking attempt; a zero return means the lock was not obtained. */
if (!page_trylock(pp, SE_EXCL))
* After locking the page, redo the above checks.
* Since we locked the page, leave out the PAGE_LOCKED() test.
vp = pp->p_vnode;
if (PP_ISSWAP(pp) ||
PP_ISFREE(pp) ||
vp == NULL ||
PP_ISKAS(pp) ||
pp->p_lckcnt != 0 ||
pp->p_cowcnt != 0 ||
(vp->v_flag & VISSWAP) != 0) {
ASSERT(vp->v_type != VCHR);
* Check the modified bit. Leaving the bit alone in hardware.
* It will be cleared if we do the putpage.
if (IS_VMODSORT(vp))
mod = hat_ismod(pp);
/* NOTE(review): the remaining arguments of this call are missing from the listing. */
mod = hat_pagesync(pp,
if (mod) {
offset = pp->p_offset;
* Hold the vnode before releasing the page lock
* to prevent it from being freed and re-used by
* some other thread.
/* Asynchronous write-back of one PAGESIZE range at the page's offset; result ignored. */
(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
kcred, NULL);
} else {
* Catch any pages which should be on the cache list,
* but aren't yet.
if (hat_page_is_mapped(pp) == 0) {
(void) page_release(pp, 1);
} else {
* maintain statistics
* reset every million wakeups, just to avoid overflow
/*
 * NOTE(review): on the reset pass the else-branch below is skipped, so
 * fsf_recent keeps the previous pass's values for that one cycle.
 */
if (++fsf_cycles == 1000000) {
fsf_cycles = 0;
fsf_total.fsf_scan = 0;
fsf_total.fsf_examined = 0;
fsf_total.fsf_locked = 0;
fsf_total.fsf_modified = 0;
fsf_total.fsf_coalesce = 0;
fsf_total.fsf_time = 0;
fsf_total.fsf_releases = 0;
} else {
/* Each line stores this pass's value in fsf_recent and adds the same value to fsf_total. */
fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 */
/*
 * Body of the flush daemon (it labels itself "fsflush" below). After
 * one-time setup it waits on fsflush_cv, walks the delayed-write buffer
 * lists, optionally runs the page flush, and every icount wakeups syncs
 * cached attributes through fsop_sync_by_kind(SYNC_ATTR), then jumps back
 * to `loop`.
 *
 * NOTE(review): the function header, braces, the `loop:` label, the mutex
 * enter/exit calls and several statement bodies are not present in this
 * listing. Confirm against the upstream file before editing code here.
 */
struct buf *bp, *dwp;
struct hbuf *hp;
int autoup;
unsigned int ix, icount, count = 0;
callb_cpr_t cprinfo;
uint_t bcount;
kmutex_t *hmp;
struct vfssw *vswp;
/* Record this process as the flush daemon and zero its accumulated CPU times. */
proc_fsflush = ttoproc(curthread);
proc_fsflush->p_cstime = 0;
proc_fsflush->p_stime = 0;
proc_fsflush->p_cutime = 0;
proc_fsflush->p_utime = 0;
/*
 * 8 bytes copies "fsflush" with its terminating NUL; 7 bytes copies the
 * name without one (presumably u_comm is already zero-filled -- TODO confirm).
 */
bcopy("fsflush", curproc->p_user.u_psargs, 8);
bcopy("fsflush", curproc->p_user.u_comm, 7);
mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);
* Setup page coalescing.
fsf_npgsz = page_num_pagesizes();
/* For each size code except the largest: ratio to the next size, and its alignment mask. */
for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
fsf_pgcnt[ix] =
page_get_pagesize(ix + 1) / page_get_pagesize(ix);
fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
/* v_autoup scaled by hz so it can be compared with lbolt differences below. */
autoup = v.v_autoup * hz;
/* Number of wakeups between attribute syncs (compared against count below). */
icount = v.v_autoup / tune.t_fsflushr;
CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
cv_wait(&fsflush_cv, &fsflush_lock); /* wait for clock */
CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
* Write back all old B_DELWRI buffers on the freelist.
bcount = 0;
for (ix = 0; ix < v.v_hbuf; ix++) {
hp = &hbuf[ix];
dwp = (struct buf *)&dwbuf[ix];
bcount += (hp->b_length);
/* A list head that points to itself is an empty delayed-write list. */
if (dwp->av_forw == dwp) {
hmp = &hbuf[ix].b_lock;
bp = dwp->av_forw;
* Go down only on the delayed write lists.
while (bp != dwp) {
ASSERT(bp->b_flags & B_DELWRI);
/*
 * Candidate for write-back: still delayed-write, at least autoup ticks
 * past b_start, and its semaphore can be taken without blocking.
 */
if ((bp->b_flags & B_DELWRI) &&
(lbolt - bp->b_start >= autoup) &&
sema_tryp(&bp->b_sem)) {
bp->b_flags |= B_ASYNC;
/* NOTE(review): both arms of this if/else are missing from the listing. */
if (bp->b_vp == NULL) {
} else {
/* Restart from the head of the list after handling a buffer. */
bp = dwp->av_forw;
} else {
bp = bp->av_forw;
* There is no need to wakeup any thread waiting on bio_mem_cv
* since brelse will wake them up as soon as IO is complete.
/* bcount is the sum of hp->b_length over all hash buckets visited above. */
bfreelist.b_bcount = bcount;
if (dopageflush)
if (!doiflush)
goto loop;
* If the system was not booted to single user mode, skip the
* inode flushing until after fsflush_iflush_delay secs have elapsed.
if ((boothowto & RB_SINGLE) == 0 &&
(lbolt64 / hz) < fsflush_iflush_delay)
goto loop;
* Flush cached attribute information (e.g. inodes).
if (++count >= icount) {
count = 0;
* Sync back cached data.
/* Iteration starts at vfssw[1]; entry 0 is not visited. */
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
/* vswp - vfssw is the table index passed as the file system kind. */
(void) fsop_sync_by_kind(vswp - vfssw,
SYNC_ATTR, kcred);
goto loop;