/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#ifdef _KERNEL
#include <sys/sysmacros.h>
#include <sys/fssnap_if.h>
#include <vm/seg_kmem.h>
#else /* _KERNEL */
#endif /* _KERNEL */
#ifdef _KERNEL
/*
* Used to verify that a given entry on the ufs_instances list (see below)
* still refers to a mounted file system.
*
* XXX: This is a crock that substitutes for proper locking to coordinate
* updates to and uses of the entries in ufs_instances.
*/
struct check_node {
	struct vfs	*vfsp;		/* these fields are assumptions, */
	struct ufsvfs	*ufsvfs;	/* inferred from the still_mounted() */
	dev_t		vfs_dev;	/* verification described below */
};
/*
* All ufs file system instances are linked together into a list starting at
* ufs_instances. The list is updated as part of mount and unmount. It's
* consulted in ufs_update, to allow syncing out all ufs file system instances
* in a batch.
*
* ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
* manipulated in ufs_funmount_cleanup. (A given ufs instance is always on
* exactly one of these lists except while it's being allocated or
* deallocated.)
*/
/*
* ufsvfs list manipulation routines
*/
/*
* Link ufsp in at the head of the list of ufs_instances.
*/
void
{
}
/*
* Remove ufsp from the list of ufs_instances.
*
* Does no error checking; ufsp is assumed to actually be on the list.
*/
void
{
break;
}
}
}
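
/*
 * Illustrative sketch of the list manipulation described above, assuming
 * the instances are singly linked through a vfs_next field and guarded by
 * ufsvfs_mutex (the *_sketch names and the vfs_next field name are
 * assumptions, not the original code):
 */
static void
ufs_vfs_add_sketch(struct ufsvfs *ufsvfsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsvfsp->vfs_next = ufs_instances;	/* link in at the head */
	ufs_instances = ufsvfsp;
	mutex_exit(&ufsvfs_mutex);
}

static void
ufs_vfs_remove_sketch(struct ufsvfs *ufsvfsp)
{
	struct ufsvfs **upp;

	mutex_enter(&ufsvfs_mutex);
	for (upp = &ufs_instances; *upp != NULL; upp = &(*upp)->vfs_next) {
		if (*upp == ufsvfsp) {
			*upp = ufsvfsp->vfs_next;	/* unlink it */
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}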
/*
* Clean up state resulting from a forcible unmount that couldn't be handled
* directly during the unmount. (See commentary in the unmount code for more
* info.)
*/
static void
{
/*
* Assumption: it's now safe to blow away the entries on
* oldufsvfslist.
*/
}
/*
* Rotate more recent unmount entries into place in preparation for
* the next time around.
*/
ufsvfslist = NULL;
}
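
/*
 * Illustrative sketch of the two-list rotation performed above, assuming
 * ufsvfs_mutex is held by the caller and that both lists are threaded
 * through vfs_next (a simplification: the real routine also releases any
 * remaining holds before freeing each instance):
 */
static void
ufs_funmount_cleanup_sketch(void)
{
	struct ufsvfs *ufsvfsp, *next;

	/* entries that have aged for a full cycle are now safe to free */
	for (ufsvfsp = oldufsvfslist; ufsvfsp != NULL; ufsvfsp = next) {
		next = ufsvfsp->vfs_next;
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}

	/* rotate the more recent unmounts into place for next time */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
}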
/*
* ufs_update performs the ufs part of `sync'. It goes through the disk
* queues to initiate sandbagged IO; goes through the inodes to write
* modified nodes; and it goes through the mount table to initiate
* the writing of the modified super blocks.
*/
extern kmutex_t ufs_scan_lock;
void
{
int check_cnt = 0;
/*
* This is a hack. A design flaw in the forced unmount protocol
* could allow a thread to attempt to use a kmem_freed ufsvfs
* structure in ufs_lockfs_begin/ufs_check_lockfs. This window
* is difficult to hit, even during the lockfs stress tests.
* So the hacky fix is to wait awhile before kmem_free'ing the
* ufsvfs structures for forcibly unmounted file systems. `Awhile'
* is defined as every other call from fsflush (~60 seconds).
*/
if (cheap)
/*
* Examine all ufsvfs structures and add those that we can lock to the
* update list. This is so that we don't hold the list lock for a
* long time. If vfs_lock fails for a file system instance, then skip
* it because somebody is doing an unmount on it.
*/
continue;
update_list = ufsp;
check_cnt++;
}
if (update_list == NULL)
return;
/*
* Write back modified superblocks.
* Consistency check that the superblock of
* each file system is still in the buffer cache.
*
* Note that the update_list traversal is done without the protection
* of an overall list lock, so it's necessary to rely on the fact that
* each entry of the list is vfs_locked when moving from one entry to
* the next. This works because a concurrent attempt to add an entry
* to another thread's update_list won't find it, since it'll already
* be locked.
*/
check_cnt = 0;
/*
* Need to grab the next ptr before we unlock this one so
* another thread doesn't grab it and change it before we move
* on to the next vfs. (Once we unlock it, it's ok if another
* thread finds it to add it to its own update_list; we don't
* attempt to refer to it through our list any more.)
*/
/*
* Seems like this can't happen, so perhaps it should become
* an ASSERT(vfsp->vfs_data != NULL).
*/
continue;
}
/*
* don't update a locked superblock during a panic; it
* may be in an inconsistent state
*/
if (panicstr) {
continue;
}
} else
/*
* Build up the STABLE check list, so we can unlock the vfs
* until we do the actual checking.
*/
if (check_list != NULL) {
ptr++;
check_cnt++;
}
}
/*
* superblock is not modified
*/
continue;
}
/*
* XXX: Why is this a return instead of a continue?
* This may be an attempt to replace a panic with
* something less drastic, but there's cleanup we
* should be doing that's not being done (e.g.,
* unlocking the remaining entries on the list).
*/
return;
}
}
/*
* Avoid racing with ufs_unmount() and ufs_sync().
*/
NULL);
/*
* Force stale buffer cache information to be flushed,
* for all devices. This should cause any remaining control
* information (e.g., cg and inode info) to be flushed back.
*/
if (check_list == NULL)
return;
/*
* For each UFS filesystem in the STABLE check_list, update
* the clean flag if warranted.
*/
int error;
/*
* still_mounted() returns with vfsp and the vfs_reflock
* held if ptr refers to a vfs that is still mounted.
*/
continue;
/*
* commit any outstanding async transactions
*/
error);
if (!error) {
}
}
}
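
/*
 * Illustrative sketch of the update_list construction described above:
 * walk ufs_instances under ufsvfs_mutex, vfs_lock() everything we can,
 * and thread the locked instances onto a private list so the global
 * mutex is not held across the slow superblock writes.  The vfs_wnext
 * link field and vfs_vfs back pointer names are assumptions here.
 */
static void
ufs_build_update_list_sketch(struct ufsvfs **update_listp)
{
	struct ufsvfs *ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (vfs_lock(ufsp->vfs_vfs) != 0)
			continue;	/* being unmounted; skip it */
		ufsp->vfs_wnext = *update_listp;	/* private list link */
		*update_listp = ufsp;
	}
	mutex_exit(&ufsvfs_mutex);
}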
int
{
return (0);
/*
* if we are panic'ing, then don't update the inode if this
* file system is FSSTABLE. Otherwise, we would have to
* force the superblock to FSACTIVE and the superblock
* may not be in a good state. Also, if the inode is
* IREF'ed then it may be in an inconsistent state. Don't
* push it. Finally, don't push the inode if the fs is
* logging; the transaction will be discarded at boot.
*/
if (panicstr) {
return (0);
return (0);
}
/*
* Limit access time only updates
*/
/*
* if file system has deferred access time turned on and there
* was no IO recently, don't bother flushing it. It will be
* flushed the next time I/O activity picks up on the file system.
*/
return (0);
/*
* an app issuing a sync() can take forever on a trans device
* when NetWorker or find is running because all of the
* directories' access times have to be updated. So, we limit
* the time we spend updating access times per sync.
*/
ufs_sync_time_secs) < time))
return (0);
}
/*
* if we are running on behalf of the flush thread or this is
* a swap file, then simply do a delay update of the inode.
* Otherwise, push the pages and then do a delayed inode update.
*/
TRANS_IUPDAT(ip, 0);
} else {
}
return (0);
}
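
/*
 * Illustrative sketch of the access-time throttling described above,
 * assuming IMODACC marks an access-time-only change and that
 * ufs_sync_time records when the current sync pass began (the flag test
 * and the helper shape are assumptions for illustration):
 */
static int
ufs_atime_throttled_sketch(struct inode *ip, time_t now)
{
	if ((ip->i_flag & (IMOD | IMODACC)) == IMODACC &&
	    (ufs_sync_time + ufs_sync_time_secs) < now)
		return (1);	/* over budget; it will be picked up later */
	return (0);
}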
/*
* Flush all the pages associated with an inode using the given 'flags',
* then force inode information to be written back using the given 'waitfor'.
*/
int
{
int error;
int dotrans = 0;
/*
* Return if file system has been forcibly umounted.
*/
return (EIO);
/*
* don't need to VOP_PUTPAGE if there are no pages
*/
error = 0;
} else {
/*
* if the inode we're working on is a shadow inode
* or quota inode we need to make sure that the
* ufs_putpage call is inside a transaction as this
* could include meta data changes.
*/
dotrans = 1;
}
if (dotrans) {
dotrans = 0;
}
}
goto out;
/*
* waitfor represents two things -
* 1. whether this is a data sync or a file sync.
* 2. if a file sync, whether ufs_iupdat should 'waitfor' the disk i/o or not.
*/
/*
* inode update, fdatasync()/FDSYNC implementation.
*/
/*
* Enter a transaction to provide mutual exclusion
* with deltamap_push and avoid a race where
* the inode flush could get dropped.
*/
dotrans = 1;
}
if (dotrans) {
}
}
} else {
/* For file sync, any inode change requires inode update */
/*
* Enter a transaction to provide mutual exclusion
* with deltamap_push and avoid a race where
* the inode flush could get dropped.
*/
dotrans = 1;
}
if (dotrans) {
}
}
}
out:
return (error);
}
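
/*
 * Illustrative sketch of the transaction wrapper described above for
 * shadow and quota inodes, using the usual ufs logging macros; the
 * TOP_PUTPAGE transaction id/size and the exact VOP_PUTPAGE arguments
 * are assumptions, and error handling is simplified:
 */
static int
ufs_putpage_trans_sketch(struct ufsvfs *ufsvfsp, struct inode *ip,
    struct vnode *vp, int flags)
{
	int dotrans = 0;
	int error;

	if (TRANS_ISTRANS(ufsvfsp) &&
	    ((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip)) {
		/* metadata may change; keep the putpage inside a transaction */
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
		dotrans = 1;
	}
	error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, flags, CRED(), NULL);
	if (dotrans)
		TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
	return (error);
}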
/*
* Flush all indirect blocks related to an inode.
* Supports triple indirect blocks also.
*/
int
{
int i;
int bsize;
int j;
/*
* unnecessary when logging; allocation blocks are kept up-to-date
*/
if (TRANS_ISTRANS(ufsvfsp))
return (0);
return (0); /* No indirect blocks used */
/* File has one indirect block. */
return (0);
}
/* Write out all the first level indirect blocks */
for (i = 0; i < NIADDR; i++) {
continue;
}
/* Write out second level of indirect blocks */
return (0);
return (EIO);
}
break;
continue;
}
/* write out third level indirect blocks */
return (0);
return (EIO);
}
break;
if ((indirect_blkno = bap[i]) == 0)
continue;
return (EIO);
}
break;
if ((blkno = indirect_bap[j]) == 0)
continue;
}
}
return (0);
}
/*
* Flush all indirect blocks related to an offset of a file.
*/
int
{
int i, j, shft;
/*
* unnecessary when logging; allocation blocks are kept up-to-date
*/
if (TRANS_ISTRANS(ufsvfsp))
return (0);
if (lbn < 0)
return (EFBIG);
/* The first NDADDR are direct so nothing to do */
return (0);
/* Determine level of indirect blocks */
shft = 0;
for (j = NIADDR; j > 0; j--) {
shft += nindirshift;
break;
}
if (j == 0)
return (EFBIG);
return (0); /* UFS Hole */
/* Flush first level indirect block */
/* Fetch through next levels */
for (; j < NIADDR; j++) {
return (EIO);
}
if (nb == 0) {
return (0); /* UFS hole */
}
}
return (0);
}
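
/*
 * Illustrative sketch of the level computation described above: each
 * level of indirection covers NINDIR(fs) times more logical blocks, so
 * keep widening the range until the target block fits (the helper name
 * and return convention are assumptions):
 */
static int
ufs_indir_level_sketch(struct fs *fs, daddr_t lbn)
{
	longlong_t tbn = (longlong_t)lbn - NDADDR;
	longlong_t range = NINDIR(fs);
	int level;

	for (level = 1; level <= NIADDR; level++) {
		if (tbn < range)
			return (level);	/* lbn lives below ib[level - 1] */
		tbn -= range;
		range *= NINDIR(fs);
	}
	return (-1);			/* beyond triple indirect: EFBIG */
}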
#ifdef DEBUG
/*
* The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
* are very expensive. It's been found from profiling that we're
* spending 6-7% of our time in ufs_badblock, and another 1-2% in
* ufs_indir_badblock. They are only called via ASSERTs (from debug kernels).
* In addition, from experience, no failures have been found in recent
* years. So the following tunable can be set to enable checking.
*/
int ufs_badblock_checks = 0;
/*
* Check that a given indirect block contains blocks in range
*/
int
{
int i;
int err = 0;
if (ufs_badblock_checks) {
break;
}
return (err);
}
/*
* Check that a specified block number is in range.
*/
int
{
long c;
if (!ufs_badblock_checks)
return (0);
return (bn);
sum = 0;
if (c == 0) {
}
/*
* if block no. is below this cylinder group,
* within the space reserved for superblock, inodes, (summary data)
* or if it is above this cylinder group
* then it's invalid.
* It's hard to see how we'd be outside this cyl, but let's be careful.
*/
return (bn);
return (0); /* not a bad block */
}
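
/*
 * Illustrative sketch of the range check described above, using the
 * standard cylinder-group macros; this is a simplification of the real
 * check (for example, it ignores the summary-information area):
 */
static int
ufs_blkno_in_range_sketch(struct fs *fs, daddr_t bn)
{
	long c;

	if ((unsigned)bn >= fs->fs_size)
		return (0);			/* past the file system */
	c = dtog(fs, bn);			/* its cylinder group */
	if (bn >= cgsblock(fs, c) && bn < cgdmin(fs, c))
		return (0);			/* in the sb/cg/inode area */
	return (1);				/* looks sane */
}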
#endif /* DEBUG */
/*
* When i_rwlock is write-locked or has a writer pending, the inode
* is going to change in a way that will mark the filesystem as
* active, so there is no need to let the filesystem be marked as
* stable now. Also, to ensure filesystem consistency during directory
* operations, the filesystem cannot be marked as stable if the i_rwlock
* of a directory inode is write-locked.
*/
/*
* Check for busy inodes for this filesystem.
* NOTE: Needs better way to do this expensive operation in the future.
*/
static void
{
int i;
mutex_enter(&ih_lock[i]);
/*
* only look at inodes that belong to this file system
*/
continue;
isbusy = 1;
isreclaim = 1;
break;
}
mutex_exit(&ih_lock[i]);
break;
}
*isreclaimp = isreclaim;
}
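
/*
 * Illustrative sketch of the inode-cache scan described above, assuming
 * the usual ihead[]/ih_lock[] hash chains; the particular flag and link
 * tests used to decide "busy" and "needs reclaim" are assumptions:
 */
static void
ufs_icheck_sketch(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	struct inode *ip;
	int i;

	for (i = 0; i < inohsz; i++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ihead[i].ih_chain[0];
		    ip != (struct inode *)&ihead[i]; ip = ip->i_forw) {
			if (ip->i_ufsvfs != ufsvfsp)
				continue;	/* some other file system */
			if (ip->i_flag & (IMOD | IUPD | ICHG | IACC))
				*isbusyp = 1;	/* dirty inode: fs is active */
			if (ip->i_nlink <= 0 && ip->i_mode != 0)
				*isreclaimp = 1; /* deleted file to reclaim */
		}
		mutex_exit(&ih_lock[i]);
	}
}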
/*
* As part of the ufs 'sync' operation, this routine is called to mark
* the filesystem as STABLE if there is no modified metadata in memory.
*/
void
{
int isbusy;
int isreclaim;
int updatesb;
/*
* if the filesystem is stable or cleanflag processing is disabled, do
* nothing; also, make no state transitions when panic'ing
*/
return;
/*
* if logging and nothing to reclaim; do nothing
*/
return;
/*
* FS_CHECKCLEAN is reset if the file system goes dirty
* FS_CHECKRECLAIM is reset if a file gets deleted
*/
updatesb = 0;
/*
* if logging or buffers are busy; do nothing
*/
isbusy = 1;
/*
* isreclaim == TRUE means can't change the state of fs_reclaim
*/
/*
* if fs is busy or can't change the state of fs_reclaim; do nothing
*/
return;
/*
* look for busy or deleted inodes; (deleted == needs reclaim)
*/
/*
* IF POSSIBLE, RESET RECLAIM
*/
/*
* the reclaim thread is not running
*/
/*
* no files were deleted during the scan
*/
/*
* no deleted files were found in the inode cache
*/
updatesb = 1;
}
/*
* IF POSSIBLE, SET STABLE
*/
/*
* not logging
*/
/*
* file system has not gone dirty since the scan began
*/
/*
* nothing dirty was found in the buffer or inode cache
*/
updatesb = 1;
}
if (updatesb) {
}
}
/*
* called whenever an unlink occurs
*/
void
{
return;
/*
* reclaim-needed bit is already set or we need to tell
* ufs_checkclean that a file has been deleted
*/
return;
/*
* inform ufs_checkclean that the file system has gone dirty
*/
/*
* set the reclaim-needed bit
*/
}
}
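
/*
 * Illustrative sketch of the fs_reclaim flag protocol described above,
 * assuming FS_RECLAIM/FS_RECLAIMING/FS_CHECKRECLAIM bits in fs_reclaim
 * and that vfs_lock protects them (the guard conditions are simplified):
 */
static void
ufs_setreclaim_sketch(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;	/* a file has been deleted */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_RECLAIMING)) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;	/* remember reclaim is needed */
		ufs_sbwrite(ufsvfsp);		/* make it survive a crash */
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}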
/*
* Before any modified metadata is written back to the disk, this routine
* is called to mark the filesystem as ACTIVE.
*/
void
{
/*
* inform ufs_checkclean that the file system has gone dirty
*/
/*
* ignore if active or bad or suspended or readonly or logging
*/
return;
}
/*
* write superblock synchronously
*/
}
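
/*
 * Illustrative sketch of the state transition described above (a
 * simplification: the real routine is entered with vfs_lock held and
 * also consults the ulockfs state):
 */
static void
ufs_mark_active_sketch(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	fs->fs_reclaim &= ~FS_CHECKCLEAN;	/* tell ufs_checkclean we dirtied */
	if (fs->fs_clean == FSACTIVE || fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND || fs->fs_clean == FSLOG || fs->fs_ronly)
		return;				/* nothing to do */
	fs->fs_clean = FSACTIVE;
	ufs_sbwrite(ufsvfsp);			/* write superblock synchronously */
}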
/*
* ufs specific fbwrite()
*/
int
{
if (TRANS_ISTRANS(ufsvfsp))
}
/*
* ufs specific fbiwrite()
*/
int
{
int error;
}
/*
* Inlined version of fbiwrite()
*/
} else if (ufsvfsp->vfs_snapshot) {
} else {
(void) bdev_strategy(bp);
}
return (error);
}
/*
* Write the ufs superblock only.
*/
void
{
char sav_fs_fmod;
/*
* for ulockfs processing, limit the superblock writes
*/
/* try again later */
return;
}
/*
* update superblock timestamp and fs_clean checksum
* if marked FSBAD, we always want an erroneous
* checksum to force repair
*/
case FSCLEAN:
case FSSTABLE:
break;
case FSACTIVE:
case FSSUSPEND:
case FSBAD:
case FSLOG:
break;
default:
break;
}
/*
* reset incore only bits
*/
/*
* delta the whole superblock
*/
/*
* retain the incore state of fs_fmod; set the ondisk state to 0
*/
/*
* Don't release the buffer after it is written to the disk
*/
}
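
/*
 * Illustrative sketch of the fs_clean checksum convention referred to
 * above: a clean superblock satisfies fs_state + fs_time == FSOKAY, and
 * an FSBAD file system is written with a deliberately wrong value so
 * that fsck will be forced to repair it (the exact expression is an
 * assumption):
 */
static void
ufs_sb_setstate_sketch(struct fs *fs)
{
	fs->fs_time = gethrestime_sec();
	if (fs->fs_clean == FSBAD)
		fs->fs_state = -(FSOKAY - fs->fs_time);	/* force a repair */
	else
		fs->fs_state = FSOKAY - fs->fs_time;	/* valid checksum */
}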
/*
* Returns the vfs pointer if the vfs is still mounted; the vfs lock is held.
* Otherwise, returns NULL.
*
* For our purposes, "still mounted" means that the file system still appears
* on the list of UFS file system instances.
*/
static vfs_t *
{
continue;
/*
* Tentative match: verify it and try to lock. (It's not at
* all clear how the verification could fail, given that we've
* gotten this far. We would have had to reallocate the
* ufsvfs struct at hand for a new incarnation; is that really
* possible in the interval from constructing the check_node
* to here?)
*/
continue;
continue;
continue;
return (vfsp);
}
return (NULL);
}
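
/*
 * Illustrative sketch of the verification described above, using the
 * check_node fields assumed earlier in this file (the vfs_vfs and
 * vfs_dev names and the locking order are assumptions):
 */
static vfs_t *
still_mounted_sketch(struct check_node *checkp)
{
	struct ufsvfs *ufsp;
	vfs_t *vfsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;		/* not the instance we noted */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp || vfsp->vfs_dev != checkp->vfs_dev)
			continue;		/* reused for a new incarnation */
		if (vfs_lock(vfsp) != 0)
			continue;		/* being unmounted right now */
		mutex_exit(&ufsvfs_mutex);
		return (vfsp);			/* still mounted, and locked */
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}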
int
{
return (0);
}
/*
* ufs_construct_si()
* Read each cylinder group in turn and construct the summary information
*/
static int
{
char *bufs;
int i, ncg;
/*
* Initialise the buffer headers
*/
}
/*
* Repeat while there are cylinder groups left to read.
*/
do {
/*
* Issue up to NSIBUF asynchronous reads
*/
} else {
(void) bdev_strategy(bp);
}
}
/*
* wait for each read to finish;
* check for errors and copy the csum info
*/
if (!error) {
}
}
if (error) {
goto err;
}
err:
return (error);
}
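
/*
 * Simplified, synchronous sketch of the per-cylinder-group read described
 * above, using the plain buffer-cache bread() interface (the real routine
 * issues up to NSIBUF asynchronous reads and goes through the ufs logging
 * wrappers):
 */
static int
ufs_construct_si_sketch(dev_t dev, struct fs *fs)
{
	struct buf *bp;
	int i;

	for (i = 0; i < fs->fs_ncg; i++) {
		bp = bread(dev, (daddr_t)fsbtodb(fs, cgtod(fs, i)),
		    (long)fs->fs_cgsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		/* copy this cg's summary counts into the incore array */
		fs->fs_cs(fs, i) = ((struct cg *)bp->b_un.b_addr)->cg_cs;
		brelse(bp);
	}
	return (0);
}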
/*
* ufs_getsummaryinfo
*/
int
{
int i; /* `for' loop counter */
/*
* maintain metadata map for trans device (debug only)
*/
/*
* Compute #frags and allocate space for summary info
*/
/*
* The summary information is unknown, read it in from
* the cylinder groups.
*/
}
return (EIO);
}
} else {
/* Read summary info a fs block at a time */
/*
* This happens only the last iteration, so
* don't worry about size being reset
*/
return (EIO);
}
}
}
}
return (0);
}
/*
* ufs_putsummaryinfo() stores all the cylinder group summary information.
* This is only used when logging, but the file system may not
* be logging at the time, e.g., a read-only mount to flush the log
* may push the summary info out.
*/
int
{
int i; /* `for' loop counter */
if (TRANS_ISERROR(ufsvfsp)) {
return (EIO);
}
return (0);
}
bp = &b;
/* Write summary info one fs block at a time */
/*
* This happens only the last iteration, so
* don't worry about size being reset
*/
}
(void) bdev_strategy(bp);
}
if (!error) {
}
return (error);
}
/*
* Decide whether it is okay to remove within a sticky directory.
* Write access to the directory is needed, but in sticky directories
* write access alone is not sufficient:
* you can remove entries from a directory only if you own the directory,
* if you are privileged, if you own the entry or if the entry is
* a plain file and you have write access to that file.
* Function returns 0 if remove access is granted.
* Note, the caller is responsible for holding the i_contents lock
* at least as reader on the inquired inode 'ip'.
*/
int
{
return (secpolicy_vnode_remove(cr));
return (0);
}
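
/*
 * Illustrative sketch of the policy described above, assuming the usual
 * ufs mode and ownership fields; ufs_iaccess() (argument list assumed)
 * stands in for the write-access check on the entry itself:
 */
static int
ufs_sticky_remove_access_sketch(struct inode *dp, struct inode *ip,
    struct cred *cr)
{
	uid_t uid = crgetuid(cr);

	if ((dp->i_mode & ISVTX) &&		/* sticky directory */
	    uid != dp->i_uid &&			/* caller doesn't own the dir */
	    uid != ip->i_uid &&			/* ... or the entry */
	    ((ip->i_mode & IFMT) != IFREG ||	/* entry isn't a plain file */
	    ufs_iaccess(ip, IWRITE, cr, 0) != 0)) /* ... writable by caller */
		return (secpolicy_vnode_remove(cr));	/* need privilege */

	return (0);
}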
#endif /* _KERNEL */
extern int around[9];
extern int inside[9];
/*
* Update the frsum fields to reflect addition or deletion
* of some frags.
*/
void
{
int inblk;
/*
* ufsvfsp->vfs_lock is held when calling this.
*/
fragmap <<= 1;
continue;
}
field <<= 1;
subfield <<= 1;
}
}
}
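
/*
 * Conceptual sketch of the frsum update above: for every maximal run of
 * free fragments in the block's fragment map, bump (or, for cnt == -1,
 * drop) the counter for runs of that size.  The real routine does this
 * with the fragtbl/around[]/inside[] lookup tables rather than a plain
 * bit scan:
 */
static void
fragacct_sketch(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int pos = 0;

	while (pos < fs->fs_frag) {
		int run = 0;

		while (pos < fs->fs_frag && (fragmap & (1 << pos))) {
			run++;			/* this fragment is free */
			pos++;
		}
		if (run > 0 && run < fs->fs_frag)
			fraglist[run] += cnt;	/* account a run of 'run' frags */
		pos++;				/* skip the allocated fragment */
	}
}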
/*
* Block operations
*/
/*
* Check if a block is available
*/
int
{
/*
* ufsvfsp->vfs_lock is held when calling this.
*/
case 8:
return (cp[h] == 0xff);
case 4:
case 2:
case 1:
default:
#ifndef _KERNEL
#endif /* _KERNEL */
return (0);
}
}
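
/*
 * Sketch of the masking used above for the smaller fs_frag values, shown
 * here for four fragments per block: each byte of the map then covers two
 * blocks, so block h lives in cp[h >> 1] under a 4-bit mask selected by
 * the low bit of h (the 2- and 1-fragment cases follow the same pattern
 * with 2-bit and 1-bit masks):
 */
static int
isblock_frag4_sketch(unsigned char *cp, daddr_t h)
{
	unsigned char mask = 0x0f << ((h & 0x1) << 2);

	return ((cp[h >> 1] & mask) == mask);	/* all 4 frag bits set: free */
}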
/*
* Take a block out of the map
*/
void
{
/*
* ufsvfsp->vfs_lock is held when calling this.
*/
case 8:
cp[h] = 0;
return;
case 4:
return;
case 2:
return;
case 1:
return;
default:
#ifndef _KERNEL
#endif /* _KERNEL */
return;
}
}
/*
* Is block allocated?
*/
int
{
int frag;
/*
* ufsvfsp->vfs_lock is held when calling this.
*/
switch (frag) {
case 8:
return (cp[h] == 0);
case 4:
case 2:
case 1:
default:
#ifndef _KERNEL
#endif /* _KERNEL */
break;
}
return (0);
}
/*
* Put a block into the map
*/
void
{
/*
* ufsvfsp->vfs_lock is held when calling this.
*/
case 8:
cp[h] = 0xff;
return;
case 4:
return;
case 2:
return;
case 1:
return;
default:
#ifndef _KERNEL
#endif /* _KERNEL */
return;
}
}
int
{
if (len == 0)
return (0);
;
return (len);
}