/* ufs_inode.c -- extracted at revision 52d549430a19e69ddf8ff7ed6d045200cb07d7f6 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
/* UFS Inode Cache Stats -- Not protected */
/*
 * NOTE(review): the declaration that opens this kstat_named_t template
 * (e.g. "static kstat_named_t ... = {") is missing from this extraction;
 * only the initializer entries and the closing brace survive.
 */
{ "size", KSTAT_DATA_ULONG },
{ "maxsize", KSTAT_DATA_ULONG },
{ "hits", KSTAT_DATA_ULONG },
{ "misses", KSTAT_DATA_ULONG },
{ "kmem allocs", KSTAT_DATA_ULONG },
{ "kmem frees", KSTAT_DATA_ULONG },
{ "maxsize reached", KSTAT_DATA_ULONG },
{ "puts at frontlist", KSTAT_DATA_ULONG },
{ "puts at backlist", KSTAT_DATA_ULONG },
{ "queues to free", KSTAT_DATA_ULONG },
{ "scans", KSTAT_DATA_ULONG },
{ "thread idles", KSTAT_DATA_ULONG },
{ "lookup idles", KSTAT_DATA_ULONG },
{ "vget idles", KSTAT_DATA_ULONG },
{ "cache allocs", KSTAT_DATA_ULONG },
{ "cache frees", KSTAT_DATA_ULONG },
{ "pushes at close", KSTAT_DATA_ULONG }
};
/* kstat data */
int inohsz; /* number of buckets in the hash table */
/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * NOTE(review): the remainder of this sentence (presumably "... I/O is
 * going on") was lost in extraction -- confirm against upstream.
 */
/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max; /* # of allowable idle inodes */
/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
int ufs_HW = UFS_HW_DEFAULT;
int ufs_LW = UFS_LW_DEFAULT;
static void ihinit(void);
extern int hash2ints(int, int);
/* NOTE(review): orphaned tail of a function prototype; its opening line is missing. */
struct cred *, int);
/* ARGSUSED */
/*
 * kstat update callback for the UFS inode-cache statistics.
 * Writes to these kstats are refused (EACCES); they are read-only.
 * NOTE(review): the function name/parameter line between "static int"
 * and "{" is missing from this extraction, and the quoted strings below
 * are kstat field names whose enclosing statements were lost.
 */
static int
{
if (rw == KSTAT_WRITE)
return (EACCES);
"slab_alloc");
"slab_free");
"alloc");
"free");
"buf_inuse");
"buf_max");
return (0);
}
/*
 * One-time initialization of the UFS inode subsystem: validates the
 * write-throttle tunables, sizes the inode cache and idle-thread limits,
 * and starts the hash table, queues, kstats, fix-on-panic and logging
 * subsystems.
 * NOTE(review): this body is heavily truncated in this extraction --
 * most statements and the contents of several inner blocks were lost,
 * which is why some braces below appear unbalanced.
 */
void
ufs_iinit(void)
{
/*
 * Validate that ufs_HW > ufs_LW.
 * The default values for these two tunables have been increased.
 * There is now a range of values for ufs_HW that used to be
 * legal on previous Solaris versions but no longer is now.
 * Using values from that range can lead to filesystem hangs
 * unless the values are checked here.
 */
}
/*
 * Adjust the tunable `ufs_ninode' to a reasonable value
 */
if (ufs_ninode <= 0)
ufs_ninode = ncsize;
if (ufs_inode_max == 0)
}
/*
 * going on. This allows deferred access times to be flushed to disk.
 * NOTE(review): the start of this sentence was lost in extraction.
 */
/*
 * idle thread runs when 25% of ufs_ninode entries are on the queue
 */
if (ufs_idle_max == 0)
if (ufs_idle_max < UFS_IDLE_MAX)
if (ufs_idle_max > ufs_ninode)
/*
 * This is really a misnomer, it is ufs_queue_init
 */
/*
 * global hlock thread
 */
ihinit();
qtinit();
KSTAT_FLAG_VIRTUAL)) != NULL) {
}
ufsfx_init(); /* fix-on-panic initialization */
lufs_init();
}
/* ARGSUSED */
/*
 * Presumably the inode kmem-cache constructor (the name/parameter line
 * is missing from this extraction); the surviving body reports success.
 */
static int
{
return (0);
}
/* ARGSUSED */
/*
 * Presumably the inode kmem-cache destructor (the name/parameter line
 * is missing from this extraction).  The trailing extra "}" closes a
 * scope whose opening line was lost.
 */
static void
{
}
}
/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
int i;
}
/*
 * NOTE(review): the lines below are an orphaned fragment of what looks
 * like a kmem_cache_create() call; the enclosing statement was lost in
 * extraction.
 */
sizeof (struct inode), 0, ufs_inode_cache_constructor,
}
/*
 * Free an inode structure
 */
/* NOTE(review): function name, parameters and body lost in extraction. */
void
{
}
/*
 * Allocate an inode structure
 */
/*
 * NOTE(review): the function name/parameter line is missing; only
 * fragments of the body survive.  The visible code zeroes the delayed-
 * write bookkeeping fields and returns the new inode.
 */
struct inode *
{
/*
 * at this point we have a newly allocated inode
 */
ip->i_delaylen = 0;
ip->i_delayoff = 0;
/*
 * the vnode for this inode was allocated by the constructor
 */
return (ip);
}
/*
 * Look up an inode by device, inumber. If it is in core (in the
 * inode structure), honor the locking protocol. If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
/*
 * NOTE(review): signature and body lost in extraction; presumably
 * ufs_iget(), a thin wrapper over the shared lookup implementation.
 */
int
{
}
/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
/* NOTE(review): signature and body lost in extraction. */
int
{
}
/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
/*
 * NOTE(review): the function name/parameter line and the conditional
 * statements paired with the "else" keywords below were lost in
 * extraction; only the comments and bare "else" tokens survive.
 */
void
{
/*
 * an old DBE hack
 */
else
/*
 * if not swap like and it's just a regular file, we want
 * (NOTE(review): words missing here in extraction)
 * for faster sync'ing to disk
 */
else
/*
 * Is this an attribute hidden dir?
 */
else
}
/*
 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
/*
 * NOTE(review): the signature line and the bulk of the statements are
 * missing from this extraction; the control-flow fragments below
 * (cache lookup, disk read, validation, vnode finishing, quota attach)
 * are retained verbatim for reference.
 */
static int
{
int error;
int ftype; /* XXX - Remove later on */
int hno;
/*
 * Lookup inode in cache.
 */
continue;
/*
 * Found the interesting inode; hold it and drop the cache lock
 */
/*
 * if necessary, remove from idle list
 */
if (ufs_rmidle(ip))
}
/*
 * Could the inode be read from disk?
 */
goto again;
}
/*
 * Reset the vnode's attribute flags
 */
return (0);
}
/*
 * Inode was not in cache.
 *
 * Allocate a new entry
 */
/*
 * put a place holder in the cache (if not already there)
 */
goto again;
}
/*
 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
 * here, but if we do, then shadow inode allocations panic the
 * system. We don't have to hold vfs_dqrwlock for shadow inodes
 * and the ufs_iget() parameters don't tell us what we are getting
 * so we have no way of knowing this is a ufs_iget() call from
 * a ufs_ialloc() call for a shadow inode.
 */
/*
 * read the dinode
 */
/*
 * Check I/O errors
 */
if (error) {
return (error);
}
/*
 * initialize the inode's dinode
 */
/*
 * Maintain compatibility with Solaris 1.x UFS
 */
else
}
/*
 * if our caller only expects allocated inodes, verify that
 * this inode looks good; throw it out if it's bad.
 */
if (validate) {
"%s: unexpected free inode %d, run fsck(1M)%s",
return (EIO);
}
}
/*
 * Finish initializing the vnode, special handling for shadow inodes
 * because IFTOVT() will produce a v_type of VNON which is not what we
 * want, set v_type to VREG explicitly in that case.
 */
} else {
}
/*
 * read the shadow
 */
return (error);
}
}
/*
 * Only attach quota information if the inode has a type and if
 * that type is not a shadow inode.
 */
}
return (0);
}
/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
/*
 * NOTE(review): signature line and most statements missing from this
 * extraction.  The surviving fragments show the idle/delete queueing
 * logic: deleted inodes go to the delete thread (IREF stays set), other
 * inodes go to the idle queue (front or back depending on usefulness).
 */
void
{
int front;
/*
 * Because the vnode type might have been changed,
 * the dnlc_dir_purge must be called unconditionally.
 */
/*
 * Get exclusive access to inode data.
 */
/*
 * Make sure no one reclaimed the inode before we put it on
 * the freelist or destroy it. We keep our 'hold' on the vnode
 * from vn_rele until we are ready to do something with the inode.
 *
 * operation via an async putpage, so we must make sure
 * may also put a VN_HOLD on the inode before it grabs
 * the i_contents lock. This is done so we don't free
 * an inode that a thread is waiting on.
 * (NOTE(review): lines are missing from the middle of this
 * comment in this extraction -- confirm against upstream.)
 */
return;
}
/*
 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
 * and clean. It can be safely destroyed (cyf).
 */
ufs_si_del(ip);
return;
}
/*
 * queue idle inode to appropriate thread. Will check v_count == 1
 * prior to putting this on the appropriate queue.
 * Stale inodes will be unhashed and freed by the ufs idle thread
 * in ufs_idle_free()
 */
front = 1;
/*
 * Mark the i_flag to indicate that inode is being deleted.
 * This flag will be cleared when the deletion is complete.
 * This prevents nfs from sneaking in via ufs_vget() while
 * the delete is in progress (bugid 1242481).
 */
/*
 * NOIDEL means that deletes are not allowed at this time;
 * whoever resets NOIDEL will also send this inode back
 * through ufs_iinactive. IREF remains set.
 */
return;
}
return;
}
/* queue to delete thread; IREF remains set */
/* add to q */
if (front)
} else {
}
} else {
/*
 * queue to idle thread
 * Check the v_count == 1 again.
 *
 */
return;
}
uq = &ufs_idle_q;
/*
 * useful iff it has pages or is a fastsymlink; otherwise junk
 */
/* clear IREF means `on idle list' */
} else {
ufs_njunk_iq++;
}
}
/* wakeup thread(s) if q is overfull */
/* all done, release the q and inode */
}
/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, insure I/O order so wait for write to complete.
 */
/*
 * NOTE(review): signature line and most statements are missing from
 * this extraction.  The surviving fragments show the overall shape:
 * bail out on forced umount or stale inode, fetch the dinode buffer,
 * munge timestamps and legacy uid/gid fields, copy the in-core inode
 * to the dinode, then write it back (logged, sync, or delayed).
 */
void
{
int i;
int do_trans_times;
/*
 * This function is now safe to be called with either the reader
 * or writer i_contents lock.
 */
/*
 * Return if file system has been forcibly umounted.
 */
return;
/*
 * We better not update the disk inode from a stale inode.
 */
return;
return;
}
/*
 * fs is active while metadata is being written
 */
/*
 * get the dinode
 */
return;
}
/*
 * munge inode fields
 */
/*
 * For reads and concurrent re-writes, no deltas were
 * entered for the access time changes - do it now.
 */
if (do_trans_times) {
}
/*
 * For SunOS 5.0->5.4, these lines below read:
 *
 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
 *
 * where MAXUID was set to 60002. This was incorrect -
 * the uids should have been constrained to what fitted into
 * a 16-bit word.
 *
 * This means that files from 4.x filesystems that have an
 * i_suid field larger than 60002 will have that field
 * changed to 65535.
 *
 * Security note: 4.x UFS could never create a i_suid of
 * UID_LONG since that would've corresponded to -1.
 */
}
/*
 * load first direct block only if special device
 */
/*
 * We panic here because there's "no way"
 * we should have been able to create a large
 * inode with a large dev_t. Earlier layers
 * should've caught this.
 */
}
} else {
}
}
/*
 * copy inode to dinode (zero fastsymlnk in dinode)
 */
if (flag & IFASTSYMLNK) {
for (i = 1; i < NDADDR; i++)
for (i = 0; i < NIADDR; i++)
}
if (TRANS_ISTRANS(ufsvfsp)) {
/*
 * Pass only a sector size buffer containing
 * the inode, otherwise when the buffer is copied
 * into a cached roll buffer then too much memory
 * gets consumed if 8KB inode buffers are passed.
 */
sizeof (struct dinode),
/*
 * Synchronous write has guaranteed that inode
 * has been written on disk so clear the flag
 */
} else {
/*
 * This write hasn't guaranteed that inode has been
 * written on the disk.
 * Since, all updat flags on inode are cleared, we must
 * remember the condition in case inode is to be updated
 * synchronously later (e.g.- fsync()/fdatasync())
 * and inode has not been modified yet.
 */
}
} else {
/*
 * In case previous inode update was done asynchronously
 * (IBDWRITE) and this inode update request wants guaranteed
 * (synchronous) disk update, flush the inode.
 */
}
}
}
#define SINGLE 0 /* index of single indirect block */
/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn. Blocks are free'd
 * in LIFO order up to (but not including) lastbn. If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
/*
 * NOTE(review): signature line and most statements are missing from
 * this extraction; returns the number of blocks released.
 */
static long
{
int i;
long factor;
int blocksreleased = 0, nblocks;
/*
 * Calculate index in current block of last
 * block to be kept. -1 indicates the entire
 * block so we need not calculate the index.
 */
factor = 1;
if (lastbn > 0)
/*
 * Get buffer of block pointers, zero those
 * entries corresponding to blocks to be free'd,
 * and update on disk copy first.
 * *Unless* the root pointer has been synchronously
 * written to disk. If nothing points to this
 * indirect block then don't bother zero'ing and
 * writing it.
 */
return (0);
}
if (zb) {
/*
 * push any data into the log before we zero it
 */
}
} else {
/* make sure write retries are also cleared */
}
/*
 * Recursively free totally unused blocks.
 */
if (nb == 0)
continue;
} else
}
/*
 * Recursively free last partial block.
 */
if (nb != 0)
}
return (blocksreleased);
}
/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
/*
 * NOTE(review): signature line and the majority of the statements are
 * missing from this extraction.  The surviving fragments show the
 * structure: argument/type validation, the trunc-up (grow) path, the
 * trunc-down path with page zeroing, indirect- and direct-block
 * freeing, and the final paranoia checks.
 */
int
{
int boff;
int level;
long nblocks, blocksreleased = 0;
int i;
int err;
(UFS_MAXOFFSET_T) : (MAXOFF32_T);
/*
 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
 * other uses need the reader lock. opendq() holds the writer lock.
 */
/*
 * We only allow truncation of regular files and directories
 * to arbitrary lengths here. In addition, we allow symbolic
 * links to be truncated only to zero length. Other inode
 * types cannot have their length set here. Disk blocks are
 * being dealt with - especially device inodes where
 * ip->i_ordev is actually being stored in ip->i_db[0]!
 */
i_genrand++;
return (0);
}
return (0);
return (EINVAL);
return (EFBIG);
/* update ctime and mtime to please POSIX tests */
if (length == 0) {
/* nothing to cache so clear the flag */
}
return (0);
}
/* wipe out fast symlink till next access */
int j;
for (j = 1; j < NDADDR; j++)
for (j = 0; j < NIADDR; j++)
}
/*
 * Trunc up case. BMAPALLOC will insure that the right blocks
 * are allocated. This includes extending the old frag to a
 * full block (if needed) in addition to doing any work
 * needed for allocating the last block.
 */
if (boff == 0)
else
if (err == 0) {
/*
 * Save old size and set inode's size now
 * so that we don't cause too much of the
 * file to be zero'd and pushed.
 */
/*
 * Make sure we zero out the remaining bytes of
 * the page in case a mmap scribbled on it. We
 * can't prevent a mmap from writing beyond EOF
 * on the last page of a file.
 *
 */
}
/*
 * MAXOFF32_T is old 2GB size limit. If
 * this operation caused a large file to be
 * created, turn on the superblock flag
 * and update the superblock, if the flag
 * is not already on.
 */
}
}
return (err);
}
/*
 * Update the pages of the file. If the file is not being
 * truncated to a block boundary, the contents of the
 * pages following the end of the file must be zero'ed
 * in case it ever become accessible again because
 * of subsequent file growth.
 */
if (boff == 0) {
} else {
/*
 * Make sure that the last block is properly allocated.
 * We only really have to do this if the last block is
 * actually allocated since ufs_bmap will now handle the case
 * of an fragment which has no block allocated. Just to
 * be sure, we do it now independent of current allocation.
 */
if (err)
return (err);
/*
 * BMAPALLOC will call bmap_write which defers i_seq
 * processing. If the timestamps were changed, update
 * i_seq before rdip drops i_contents or syncs the inode.
 */
/*
 * BugId 4069932
 * Make sure that the relevant partial page appears in
 * the v_pages list, so that pvn_vpzero() will do its
 * job. Since doing this correctly requires everything
 * in rdip() except for the uiomove(), it's easier and
 * safer to do the uiomove() rather than duplicate the
 * rest of rdip() here.
 *
 * To get here, we know that length indicates a byte
 * that is not the first byte of a block. (length - 1)
 * is the last actual byte known to exist. Deduction
 * shows it is in the same block as byte (length).
 * Thus, this rdip() invocation should always succeed
 * except in the face of i/o errors, and give us the
 * block we care about.
 *
 * rdip() makes the same locking assertions and
 * assumptions as we do. We do not acquire any locks
 * before calling it, so we have not changed the locking
 * situation. Finally, there do not appear to be any
 * paths whereby rdip() ends up invoking us again.
 * Thus, infinite recursion is avoided.
 */
{
char buffer;
if (err)
return (err);
}
/*
 * Ensure full fs block is marked as dirty.
 */
}
/*
 * Calculate index into inode's block list of
 * last direct and indirect blocks (if any)
 * which we want to keep. Lastblock is -1 when
 * the file is truncated to 0.
 */
/*
 * Update file and block pointers
 * on disk before we start freeing blocks.
 * If we crash before free'ing blocks below,
 * the blocks will be returned to the free list.
 * lastiblock values are also normalized to -1
 * for calls to indirtrunc below.
 */
if (lastiblock[level] < 0) {
}
}
if (!TRANS_ISTRANS(ufsvfsp))
/*
 * Indirect blocks first.
 */
if (bn != 0) {
if (lastiblock[level] < 0) {
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
 * All whole direct blocks or frags.
 */
if (bn == 0)
continue;
}
if (lastblock < 0)
goto done;
/*
 * Finally, look for a change in size of the
 * last direct block; release any frags.
 */
if (bn != 0) {
/*
 * Calculate amount of space we're giving
 * back as old block size minus new block size.
 */
if (newspace == 0) {
return (err);
}
/*
 * Block number of space to be free'd is
 * the old block # plus the number of frags
 * required for the storage we're keeping.
 */
}
}
done:
/* BEGIN PARANOIA */
return (err);
}
for (i = 0; i < NDADDR; i++)
return (err);
}
/* END PARANOIA */
"ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
}
/* blocksreleased is >= zero, so this can not fail */
return (0);
}
/*
 * Check mode permission on inode. Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked. Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 */
/*
 * NOTE(review): signature line and most statements missing from this
 * extraction.  The surviving fragments show the owner/group/other
 * bit-shift selection and the EROFS check for read-only filesystems.
 */
int
{
int shift = 0;
/*
 * Disallow write attempts on read-only
 * file systems, unless the file is a block
 * or character device or a FIFO.
 */
return (EROFS);
}
}
}
/*
 * If there is a shadow inode check for the presence of an acl,
 * if the acl is there use the ufs_acl_access routine to check
 * the acl
 */
/*
 * Access check is based on only
 * one of owner, group, public.
 * If not owner, then check group.
 * If not a member of the group, then
 * check public access.
 */
shift += 3;
shift += 3;
}
if (mode == 0)
return (0);
/* test missing privilege bits */
}
/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
/*
 * NOTE(review): signature line and the conditional statements around
 * the queue-count decrements are missing from this extraction.
 */
int
{
int rval = 0;
ufs_idle_q.uq_ne--;
ufs_njunk_iq--;
} else {
}
rval = 1;
}
return (rval);
}
/*
 * scan the hash of inodes and call func with the inode locked
 */
/*
 * NOTE(review): signature line and many statements are missing from
 * this extraction.  The surviving fragments show the per-bucket hash
 * walk, the hold/release protocol for the previous inode (lip), and
 * the trylock handling; returns the last error seen (saverror).
 */
int
{
int error, i;
int saverror = 0;
int lip_held; /* lip needs a VN_RELE() */
/*
 * If ufsvfsp is NULL, then our caller should be holding
 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
 * ufs_update(). Otherwise, to avoid false-positives in
 * ufs_unmount()'s v_count-based EBUSY check, we only hold
 * those inodes that are in the file system our caller cares
 * about.
 *
 * We know that ip is a valid inode in the hash chain (and thus
 * we can trust i_ufsvfs) because the inode we chained from
 * (lip) is still in the hash chain. This is true because either:
 *
 * 1. We did not drop the hash chain lock since the last
 * iteration (because we were not interested in the last inode),
 * or
 * 2. We maintained a hold on the last inode while we
 * we were processing it, so it could not be removed
 * from the hash chain.
 *
 * The whole reason we're dropping and re-grabbing the chain
 * lock on every inode is so that we don't present a major
 * choke point on throughput, particularly when we've been
 * called on behalf of fsflush.
 */
mutex_enter(&ih_lock[i]);
/*
 * Undo the previous iteration's VN_HOLD(), but
 * only if one was done.
 */
if (lip_held)
/*
 * We're not processing all inodes, and
 * this inode is not in the filesystem of
 * interest, so skip it. No need to do a
 * VN_HOLD() since we're not dropping the
 * hash chain lock until after we've
 * done the i_forw traversal above.
 */
lip_held = 0;
continue;
}
lip_held = 1;
mutex_exit(&ih_lock[i]);
/*
 * Acquire the contents lock as writer to make
 * sure that the inode has been initialized in
 * the cache or removed from the idle list by
 * ufs_iget(). This works because ufs_iget()
 * acquires the contents lock before putting
 * the inode into the cache. If we can lock
 * it, then he's done with it.
 */
if (rwtry) {
mutex_enter(&ih_lock[i]);
continue;
}
} else {
}
/*
 * ISTALE means the inode couldn't be read
 *
 * We don't have to hold the i_contents lock
 * for this check for a couple of
 * reasons. First, if ISTALE is set then the
 * flag cannot be cleared until the inode is
 * removed from the cache and that cannot
 * happen until after we VN_RELE() it.
 * Second, if ISTALE is not set, then the
 * inode is in the cache and does not need to
 * be read from disk so ISTALE cannot be set
 * while we are not looking.
 */
}
mutex_enter(&ih_lock[i]);
}
if (lip_held)
mutex_exit(&ih_lock[i]);
}
return (saverror);
}
/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
/*
 * NOTE(review): signature line and nearly all statements are missing
 * from this extraction; only the gethrestime() call and fragments of
 * the nsec-to-usec conversion survive, so the brace structure below is
 * incomplete.
 */
void
{
/*
 * The update of i_seq may have been deferred, increase i_seq here
 * to make sure it is in sync with the timestamps.
 */
}
gethrestime(&now);
/*
 * Fast algorithm to convert nsec to usec -- see hrt2ts()
 */
}
} else {
/* Check for usec overflow */
}
}
}
}
}
}
}
/*
 * Update timestamps in inode.
 */
/*
 * NOTE(review): signature line and most statements missing from this
 * extraction; the surviving fragment is the noatime early-return check.
 */
void
{
/*
 * if noatime is set and the inode access time is the only field that
 * must be changed, exit immediately.
 */
return;
}
else
}
}