ufs_inode.c revision 52d549430a19e69ddf8ff7ed6d045200cb07d7f6
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>
struct kmem_cache *inode_cache; /* cache of free inodes */
/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
{ "size", KSTAT_DATA_ULONG },
{ "maxsize", KSTAT_DATA_ULONG },
{ "hits", KSTAT_DATA_ULONG },
{ "misses", KSTAT_DATA_ULONG },
{ "kmem allocs", KSTAT_DATA_ULONG },
{ "kmem frees", KSTAT_DATA_ULONG },
{ "maxsize reached", KSTAT_DATA_ULONG },
{ "puts at frontlist", KSTAT_DATA_ULONG },
{ "puts at backlist", KSTAT_DATA_ULONG },
{ "queues to free", KSTAT_DATA_ULONG },
{ "scans", KSTAT_DATA_ULONG },
{ "thread idles", KSTAT_DATA_ULONG },
{ "lookup idles", KSTAT_DATA_ULONG },
{ "vget idles", KSTAT_DATA_ULONG },
{ "cache allocs", KSTAT_DATA_ULONG },
{ "cache frees", KSTAT_DATA_ULONG },
{ "pushes at close", KSTAT_DATA_ULONG }
};
/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;
union ihead *ihead; /* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock; /* protect inode cache hash table */
static int ino_hashlen = 4; /* desired average hash chain length */
int inohsz; /* number of buckets in the hash table */
kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;
/*
* time to wait after ufsvfsp->vfs_iotstamp before declaring that no
* I/Os are going on.
*/
clock_t ufs_iowait;
/*
* the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
*/
int ufs_idle_max; /* # of allowable idle inodes */
ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */
#define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */
/*
* Tunables for ufs write throttling.
* These are validated in ufs_iinit() since improper settings
* can lead to filesystem hangs.
*/
#define UFS_HW_DEFAULT (16 * 1024 * 1024)
#define UFS_LW_DEFAULT (8 * 1024 * 1024)
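/*
 * ufs_HW and ufs_LW are the high/low water marks, in bytes of outstanding
 * async writes per inode (i_writes), used by the write throttle in
 * ufs_write(): writers block on i_wrcv once i_writes exceeds ufs_HW and
 * are woken again when it drains below ufs_LW.
 */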
int ufs_HW = UFS_HW_DEFAULT;
int ufs_LW = UFS_LW_DEFAULT;
static void ihinit(void);
extern int hash2ints(int, int);
static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
struct cred *, int);
/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
if (rw == KSTAT_WRITE)
return (EACCES);
ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"slab_alloc");
ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"slab_free");
ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"alloc");
ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"free");
ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"buf_inuse");
ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
"buf_max");
ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
return (0);
}
void
ufs_iinit(void)
{
/*
* Validate that ufs_HW > ufs_LW.
* The default values for these two tunables have been increased.
	 * There is a range of ufs_HW values that used to be legal on
	 * previous Solaris versions but is no longer legal now.
* Upgrading a machine which has an /etc/system setting for ufs_HW
* from that range can lead to filesystem hangs unless the values
* are checked here.
*/
if (ufs_HW <= ufs_LW) {
cmn_err(CE_WARN,
"ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
ufs_HW, ufs_LW);
ufs_LW = UFS_LW_DEFAULT;
ufs_HW = UFS_HW_DEFAULT;
cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
ufs_HW, ufs_LW);
}
/*
* Adjust the tunable `ufs_ninode' to a reasonable value
*/
if (ufs_ninode <= 0)
ufs_ninode = ncsize;
if (ufs_inode_max == 0)
ufs_inode_max =
(ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
ufs_inode_max);
ufs_ninode = ufs_inode_max;
}
/*
	 * Wait until the third call of ufs_update to declare that no I/Os are
* going on. This allows deferred access times to be flushed to disk.
*/
ufs_iowait = v.v_autoup * hz * 2;
/*
* idle thread runs when 25% of ufs_ninode entries are on the queue
*/
if (ufs_idle_max == 0)
ufs_idle_max = ufs_ninode >> 2;
if (ufs_idle_max < UFS_IDLE_MAX)
ufs_idle_max = UFS_IDLE_MAX;
if (ufs_idle_max > ufs_ninode)
ufs_idle_max = ufs_ninode;
/*
	 * This is really a misnomer; it is effectively ufs_queue_init.
*/
ufs_thread_init(&ufs_idle_q, ufs_idle_max);
ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
/*
* global hlock thread
*/
ufs_thread_init(&ufs_hlock, 1);
ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
ihinit();
qtinit();
ins.in_maxsize.value.ul = ufs_ninode;
if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
ufs_inode_kstat->ks_data = (void *)&ins;
ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
kstat_install(ufs_inode_kstat);
}
ufsfx_init(); /* fix-on-panic initialization */
si_cache_init();
ufs_directio_init();
lufs_init();
mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}
/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
struct inode *ip = buf;
struct vnode *vp;
rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
dnlc_dir_init(&ip->i_danchor);
cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
vp = vn_alloc(KM_SLEEP);
ip->i_vnode = vp;
vn_setops(vp, ufs_vnodeops);
vp->v_data = (caddr_t)ip;
return (0);
}
/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
struct inode *ip = buf;
struct vnode *vp;
vp = ITOV(ip);
rw_destroy(&ip->i_rwlock);
rw_destroy(&ip->i_contents);
mutex_destroy(&ip->i_tlock);
if (vp->v_type == VDIR) {
dnlc_dir_fini(&ip->i_danchor);
}
cv_destroy(&ip->i_wrcv);
vn_free(vp);
}
/*
* Initialize hash links for inodes
* and build inode free list.
*/
void
ihinit(void)
{
int i;
union ihead *ih = ihead;
mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
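	/*
	 * Size the hash table to a power of two, aiming for an average
	 * chain length of ino_hashlen.
	 */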
inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
ih->ih_head[0] = ih;
ih->ih_head[1] = ih;
mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
}
inode_cache = kmem_cache_create("ufs_inode_cache",
sizeof (struct inode), 0, ufs_inode_cache_constructor,
ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
NULL, NULL, 0);
}
/*
* Free an inode structure
*/
void
ufs_free_inode(struct inode *ip)
{
vn_invalid(ITOV(ip));
kmem_cache_free(inode_cache, ip);
}
/*
* Allocate an inode structure
*/
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
struct inode *ip;
vnode_t *vp;
ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
/*
* at this point we have a newly allocated inode
*/
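	/* self-referencing free-list links mean "not on any idle/delete queue" */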
ip->i_freef = ip;
ip->i_freeb = ip;
ip->i_flag = IREF;
ip->i_seq = 0xFF; /* Unique initial value */
ip->i_dev = ufsvfsp->vfs_dev;
ip->i_ufsvfs = ufsvfsp;
ip->i_devvp = ufsvfsp->vfs_devvp;
ip->i_number = ino;
ip->i_diroff = 0;
ip->i_nextr = 0;
ip->i_map = NULL;
ip->i_rdev = 0;
ip->i_writes = 0;
ip->i_mode = 0;
ip->i_delaylen = 0;
ip->i_delayoff = 0;
ip->i_nextrio = 0;
ip->i_ufs_acl = NULL;
ip->i_cflags = 0;
ip->i_mapcnt = 0;
ip->i_dquot = NULL;
ip->i_cachedir = CD_ENABLED;
ip->i_writer = NULL;
/*
* the vnode for this inode was allocated by the constructor
*/
vp = ITOV(ip);
vn_reinit(vp);
if (ino == (ino_t)UFSROOTINO)
vp->v_flag = VROOT;
vp->v_vfsp = ufsvfsp->vfs_vfs;
vn_exists(vp);
return (ip);
}
/*
* Look up an inode by device, inumber. If it is in core (in the
* inode structure), honor the locking protocol. If it is not in
* core, read it in from the specified device after freeing any pages.
* In all cases, a pointer to a VN_HELD inode structure is returned.
*/
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}
/*
* A version of ufs_iget which returns only allocated, linked inodes.
* This is appropriate for any callers who do not expect a free inode.
*/
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
struct cred *cr)
{
return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}
/*
 * Set vnode attributes based on v_type; this should be called
 * whenever an inode's i_mode is changed.
*/
void
ufs_reset_vnode(vnode_t *vp)
{
/*
* an old DBE hack
*/
if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
vp->v_flag |= VSWAPLIKE;
else
vp->v_flag &= ~VSWAPLIKE;
/*
	 * if it's not swap-like and just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster syncing to disk
*/
if (vp->v_type == VREG)
vp->v_flag |= VMODSORT;
else
vp->v_flag &= ~VMODSORT;
/*
* Is this an attribute hidden dir?
*/
if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
vp->v_flag |= V_XATTRDIR;
else
vp->v_flag &= ~V_XATTRDIR;
}
/*
* Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate'
* flag is used to distinguish the two; when true, we validate that the inode
* being retrieved looks like a linked and allocated inode.
*/
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
struct cred *cr, int validate)
{
struct inode *ip, *sp;
union ihead *ih;
kmutex_t *ihm;
struct buf *bp;
struct dinode *dp;
struct vnode *vp;
extern vfs_t EIO_vfs;
int error;
int ftype; /* XXX - Remove later on */
dev_t vfs_dev;
struct ufsvfs *ufsvfsp;
struct fs *fs;
int hno;
daddr_t bno;
ulong_t ioff;
CPU_STATS_ADD_K(sys, ufsiget, 1);
/*
* Lookup inode in cache.
*/
vfs_dev = vfsp->vfs_dev;
hno = INOHASH(ino);
ih = &ihead[hno];
ihm = &ih_lock[hno];
again:
mutex_enter(ihm);
for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
if (ino != ip->i_number || vfs_dev != ip->i_dev ||
(ip->i_flag & ISTALE))
continue;
/*
* Found the interesting inode; hold it and drop the cache lock
*/
vp = ITOV(ip); /* for locknest */
VN_HOLD(vp);
mutex_exit(ihm);
rw_enter(&ip->i_contents, RW_READER);
/*
* if necessary, remove from idle list
*/
if ((ip->i_flag & IREF) == 0) {
if (ufs_rmidle(ip))
VN_RELE(vp);
}
/*
* Could the inode be read from disk?
*/
if (ip->i_flag & ISTALE) {
rw_exit(&ip->i_contents);
VN_RELE(vp);
goto again;
}
ins.in_hits.value.ul++;
*ipp = ip;
/*
* Reset the vnode's attribute flags
*/
mutex_enter(&vp->v_lock);
ufs_reset_vnode(vp);
mutex_exit(&vp->v_lock);
rw_exit(&ip->i_contents);
return (0);
}
mutex_exit(ihm);
/*
* Inode was not in cache.
*
* Allocate a new entry
*/
ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
fs = ufsvfsp->vfs_fs;
ip = ufs_alloc_inode(ufsvfsp, ino);
vp = ITOV(ip);
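	/*
	 * bno is the disk address of the filesystem block holding the
	 * dinode, ioff is the dinode's byte offset within that block,
	 * and i_doff is its absolute byte offset on the device.
	 */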
bno = fsbtodb(fs, itod(fs, ino));
ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
ip->i_doff = (offset_t)ioff + ldbtob(bno);
/*
* put a place holder in the cache (if not already there)
*/
mutex_enter(ihm);
for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
if (ino == sp->i_number && vfs_dev == sp->i_dev &&
((sp->i_flag & ISTALE) == 0)) {
mutex_exit(ihm);
ufs_free_inode(ip);
goto again;
}
/*
* It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
* here, but if we do, then shadow inode allocations panic the
* system. We don't have to hold vfs_dqrwlock for shadow inodes
* and the ufs_iget() parameters don't tell us what we are getting
* so we have no way of knowing this is a ufs_iget() call from
* a ufs_ialloc() call for a shadow inode.
*/
rw_enter(&ip->i_contents, RW_WRITER);
insque(ip, ih);
mutex_exit(ihm);
/*
* read the dinode
*/
bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
/*
* Check I/O errors
*/
error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
if (error) {
brelse(bp);
ip->i_flag |= ISTALE; /* in case someone is looking it up */
rw_exit(&ip->i_contents);
vp->v_vfsp = &EIO_vfs;
VN_RELE(vp);
return (error);
}
/*
* initialize the inode's dinode
*/
dp = (struct dinode *)(ioff + bp->b_un.b_addr);
ip->i_ic = dp->di_ic; /* structure assignment */
brelse(bp);
/*
* Maintain compatibility with Solaris 1.x UFS
*/
if (ip->i_suid != UID_LONG)
ip->i_uid = ip->i_suid;
if (ip->i_sgid != GID_LONG)
ip->i_gid = ip->i_sgid;
ftype = ip->i_mode & IFMT;
if (ftype == IFBLK || ftype == IFCHR) {
dev_t dv;
uint_t top16 = ip->i_ordev & 0xffff0000u;
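		/*
		 * If the upper 16 bits are all zeros or all ones, i_ordev
		 * holds an old 16-bit device number; otherwise it is a
		 * 32-bit dev32_t.
		 */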
if (top16 == 0 || top16 == 0xffff0000u)
dv = expdev(ip->i_ordev);
else
dv = expldev(ip->i_ordev);
vp->v_rdev = ip->i_rdev = dv;
}
/*
* if our caller only expects allocated inodes, verify that
* this inode looks good; throw it out if it's bad.
*/
if (validate) {
if ((ftype == 0) || (ip->i_nlink <= 0)) {
ip->i_flag |= ISTALE;
rw_exit(&ip->i_contents);
vp->v_vfsp = &EIO_vfs;
VN_RELE(vp);
cmn_err(CE_NOTE,
"%s: unexpected free inode %d, run fsck(1M)%s",
fs->fs_fsmnt, (int)ino,
(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
return (EIO);
}
}
/*
	 * Finish initializing the vnode. Special handling is needed for
	 * shadow inodes, because IFTOVT() would produce a v_type of VNON,
	 * which is not what we want; set v_type to VREG explicitly in
	 * that case.
*/
if (ftype == IFSHAD) {
vp->v_type = VREG;
} else {
vp->v_type = IFTOVT((mode_t)ip->i_mode);
}
ufs_reset_vnode(vp);
/*
* read the shadow
*/
if (ftype != 0 && ip->i_shadow != 0) {
if ((error = ufs_si_load(ip, cr)) != 0) {
ip->i_flag |= ISTALE;
ip->i_ufs_acl = NULL;
rw_exit(&ip->i_contents);
vp->v_vfsp = &EIO_vfs;
VN_RELE(vp);
return (error);
}
}
/*
	 * Only attach quota information if the inode has a type and that
	 * type is neither a shadow inode nor an attribute directory.
*/
if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
((ip->i_mode & IFMT) != IFATTRDIR)) {
ip->i_dquot = getinoquota(ip);
}
TRANS_MATA_IGET(ufsvfsp, ip);
*ipp = ip;
rw_exit(&ip->i_contents);
return (0);
}
/*
* Vnode is no longer referenced, write the inode out
* and if necessary, truncate and deallocate the file.
*/
void
ufs_iinactive(struct inode *ip)
{
int front;
struct inode *iq;
struct inode *hip;
struct ufs_q *uq;
struct vnode *vp = ITOV(ip);
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
/*
* Because the vnode type might have been changed,
* the dnlc_dir_purge must be called unconditionally.
*/
dnlc_dir_purge(&ip->i_danchor);
/*
* Get exclusive access to inode data.
*/
rw_enter(&ip->i_contents, RW_WRITER);
ASSERT(ip->i_flag & IREF);
/*
* Make sure no one reclaimed the inode before we put it on
* the freelist or destroy it. We keep our 'hold' on the vnode
* from vn_rele until we are ready to do something with the inode.
*
* Pageout may put a VN_HOLD/VN_RELE at anytime during this
* operation via an async putpage, so we must make sure
* we don't free/destroy the inode more than once. ufs_iget
* may also put a VN_HOLD on the inode before it grabs
* the i_contents lock. This is done so we don't free
* an inode that a thread is waiting on.
*/
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--; /* release our hold from vn_rele */
mutex_exit(&vp->v_lock);
rw_exit(&ip->i_contents);
return;
}
mutex_exit(&vp->v_lock);
/*
* For umount case: if ufsvfs ptr is NULL, the inode is unhashed
* and clean. It can be safely destroyed (cyf).
*/
if (ip->i_ufsvfs == NULL) {
rw_exit(&ip->i_contents);
ufs_si_del(ip);
ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
ufs_free_inode(ip);
return;
}
/*
	 * queue the idle inode to the appropriate thread, which will check
	 * v_count == 1 again prior to putting it on that queue.
* Stale inodes will be unhashed and freed by the ufs idle thread
* in ufs_idle_free()
*/
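	/* front is never cleared below, so deletes always queue at the head */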
front = 1;
if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
ip->i_mode && ip->i_nlink <= 0) {
/*
* Mark the i_flag to indicate that inode is being deleted.
* This flag will be cleared when the deletion is complete.
* This prevents nfs from sneaking in via ufs_vget() while
* the delete is in progress (bugid 1242481).
*/
ip->i_flag |= IDEL;
/*
* NOIDEL means that deletes are not allowed at this time;
* whoever resets NOIDEL will also send this inode back
* through ufs_iinactive. IREF remains set.
*/
if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
mutex_enter(&vp->v_lock);
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&ip->i_contents);
return;
}
if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
rw_exit(&ip->i_contents);
ufs_delete(ip->i_ufsvfs, ip, 0);
return;
}
/* queue to delete thread; IREF remains set */
ins.in_qfree.value.ul++;
uq = &ip->i_ufsvfs->vfs_delete;
mutex_enter(&uq->uq_mutex);
/* add to q */
if ((iq = uq->uq_ihead) != 0) {
ip->i_freef = iq;
ip->i_freeb = iq->i_freeb;
iq->i_freeb->i_freef = ip;
iq->i_freeb = ip;
if (front)
uq->uq_ihead = ip;
} else {
uq->uq_ihead = ip;
ip->i_freef = ip;
ip->i_freeb = ip;
}
delq_info->delq_unreclaimed_files += 1;
delq_info->delq_unreclaimed_blocks += ip->i_blocks;
} else {
		/*
		 * queue to idle thread
		 * Check that v_count == 1 again.
		 */
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--; /* release our hold from vn_rele */
mutex_exit(&vp->v_lock);
rw_exit(&ip->i_contents);
return;
}
mutex_exit(&vp->v_lock);
uq = &ufs_idle_q;
/*
* useful iff it has pages or is a fastsymlink; otherwise junk
*/
mutex_enter(&uq->uq_mutex);
/* clear IREF means `on idle list' */
ip->i_flag &= ~(IREF | IDIRECTIO);
if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
ins.in_frback.value.ul++;
hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
ufs_nuseful_iq++;
} else {
ins.in_frfront.value.ul++;
hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
ip->i_flag |= IJUNKIQ;
ufs_njunk_iq++;
}
ip->i_freef = hip;
ip->i_freeb = hip->i_freeb;
hip->i_freeb->i_freef = ip;
hip->i_freeb = ip;
}
/* wakeup thread(s) if q is overfull */
if (++uq->uq_ne == uq->uq_lowat)
cv_broadcast(&uq->uq_cv);
/* all done, release the q and inode */
mutex_exit(&uq->uq_mutex);
rw_exit(&ip->i_contents);
}
/*
* Check accessed and update flags on an inode structure.
* If any are on, update the inode with the (unique) current time.
 * If waitfor is set, ensure I/O ordering by waiting for the write to complete.
*/
void
ufs_iupdat(struct inode *ip, int waitfor)
{
struct buf *bp;
struct fs *fp;
struct dinode *dp;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
int i;
int do_trans_times;
ushort_t flag;
o_uid_t suid;
o_gid_t sgid;
/*
* This function is now safe to be called with either the reader
* or writer i_contents lock.
*/
ASSERT(RW_LOCK_HELD(&ip->i_contents));
/*
* Return if file system has been forcibly umounted.
*/
if (ufsvfsp == NULL)
return;
flag = ip->i_flag; /* Atomic read */
/*
* We better not update the disk inode from a stale inode.
*/
if (flag & ISTALE)
return;
fp = ip->i_fs;
if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
if (fp->fs_ronly) {
mutex_enter(&ip->i_tlock);
ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
mutex_exit(&ip->i_tlock);
return;
}
/*
* fs is active while metadata is being written
*/
mutex_enter(&ufsvfsp->vfs_lock);
ufs_notclean(ufsvfsp);
/*
* get the dinode
*/
bp = UFS_BREAD(ufsvfsp, ip->i_dev,
(daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
(int)fp->fs_bsize);
if (bp->b_flags & B_ERROR) {
mutex_enter(&ip->i_tlock);
ip->i_flag &=
~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
mutex_exit(&ip->i_tlock);
brelse(bp);
return;
}
/*
* munge inode fields
*/
mutex_enter(&ip->i_tlock);
ITIMES_NOLOCK(ip);
do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
mutex_exit(&ip->i_tlock);
/*
* For reads and concurrent re-writes, no deltas were
* entered for the access time changes - do it now.
*/
if (do_trans_times) {
TRANS_INODE_TIMES(ufsvfsp, ip);
}
/*
* For SunOS 5.0->5.4, these lines below read:
*
* suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
* sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
*
* where MAXUID was set to 60002. This was incorrect -
* the uids should have been constrained to what fitted into
* a 16-bit word.
*
* This means that files from 4.x filesystems that have an
* i_suid field larger than 60002 will have that field
* changed to 65535.
*
* Security note: 4.x UFS could never create a i_suid of
* UID_LONG since that would've corresponded to -1.
*/
suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
UID_LONG : ip->i_uid;
sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
GID_LONG : ip->i_gid;
if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
ip->i_suid = suid;
ip->i_sgid = sgid;
TRANS_INODE(ufsvfsp, ip);
}
if ((ip->i_mode & IFMT) == IFBLK ||
(ip->i_mode & IFMT) == IFCHR) {
dev_t d = ip->i_rdev;
dev32_t dev32;
/*
* load first direct block only if special device
*/
if (!cmpldev(&dev32, d)) {
/*
* We panic here because there's "no way"
* we should have been able to create a large
* inode with a large dev_t. Earlier layers
* should've caught this.
*/
panic("ip %p: i_rdev too big", (void *)ip);
}
if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
ip->i_ordev = dev32; /* can't use old fmt. */
} else {
ip->i_ordev = cmpdev(d);
}
}
/*
* copy inode to dinode (zero fastsymlnk in dinode)
*/
dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
dp->di_ic = ip->i_ic; /* structure assignment */
if (flag & IFASTSYMLNK) {
for (i = 1; i < NDADDR; i++)
dp->di_db[i] = 0;
for (i = 0; i < NIADDR; i++)
dp->di_ib[i] = 0;
}
if (TRANS_ISTRANS(ufsvfsp)) {
/*
			 * Pass only a sector-sized buffer containing
			 * the inode; otherwise, when the buffer is copied
			 * into a cached roll buffer, too much memory is
			 * consumed if 8KB inode buffers are passed.
*/
TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
sizeof (struct dinode),
(caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
DEV_BSIZE);
brelse(bp);
} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
UFS_BRWRITE(ufsvfsp, bp);
/*
			 * The synchronous write has guaranteed that the
			 * inode has been written to disk, so clear the flag.
*/
mutex_enter(&ip->i_tlock);
ip->i_flag &= ~IBDWRITE;
mutex_exit(&ip->i_tlock);
} else {
bdrwrite(bp);
/*
			 * This write hasn't guaranteed that the inode has
			 * been written on the disk.
			 * Since all update flags on the inode are cleared,
			 * we must remember the condition in case the inode
			 * is to be updated synchronously later (e.g.
			 * fsync()/fdatasync()) and has not been modified yet.
*/
mutex_enter(&ip->i_tlock);
ip->i_flag |= IBDWRITE;
mutex_exit(&ip->i_tlock);
}
} else {
/*
* In case previous inode update was done asynchronously
* (IBDWRITE) and this inode update request wants guaranteed
* (synchronous) disk update, flush the inode.
*/
if (waitfor && (flag & IBDWRITE)) {
blkflush(ip->i_dev,
(daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
mutex_enter(&ip->i_tlock);
ip->i_flag &= ~IBDWRITE;
mutex_exit(&ip->i_tlock);
}
}
}
#define SINGLE 0 /* index of single indirect block */
#define DOUBLE 1 /* index of double indirect block */
#define TRIPLE 2 /* index of triple indirect block */
/*
* Release blocks associated with the inode ip and
* stored in the indirect block bn. Blocks are free'd
* in LIFO order up to (but not including) lastbn. If
* level is greater than SINGLE, the block is an indirect
* block and recursive calls to indirtrunc must be used to
* cleanse other indirect blocks.
*
* N.B.: triple indirect blocks are untested.
*/
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
int i;
struct buf *bp, *copy;
daddr32_t *bap;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
daddr_t nb, last;
long factor;
int blocksreleased = 0, nblocks;
ASSERT(RW_WRITE_HELD(&ip->i_contents));
/*
* Calculate index in current block of last
* block to be kept. -1 indicates the entire
* block so we need not calculate the index.
*/
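	/* factor = number of data blocks mapped by one pointer at this level */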
factor = 1;
for (i = SINGLE; i < level; i++)
factor *= NINDIR(fs);
last = lastbn;
if (lastbn > 0)
last /= factor;
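	/* DEV_BSIZE sectors per filesystem block */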
nblocks = btodb(fs->fs_bsize);
/*
* Get buffer of block pointers, zero those
* entries corresponding to blocks to be free'd,
	 * and update the on-disk copy first.
* *Unless* the root pointer has been synchronously
* written to disk. If nothing points to this
* indirect block then don't bother zero'ing and
* writing it.
*/
bp = UFS_BREAD(ufsvfsp,
ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
if (bp->b_flags & B_ERROR) {
brelse(bp);
return (0);
}
bap = bp->b_un.b_daddr;
if ((flags & I_CHEAP) == 0) {
uint_t zb;
zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
if (zb) {
/*
* push any data into the log before we zero it
*/
if (bp->b_flags & B_DELWRI)
TRANS_LOG(ufsvfsp, (caddr_t)bap,
ldbtob(bp->b_blkno), bp->b_bcount,
bp->b_un.b_addr, bp->b_bcount);
copy = ngeteblk(fs->fs_bsize);
bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
(uint_t)fs->fs_bsize);
bzero((caddr_t)&bap[last + 1], zb);
TRANS_BUF(ufsvfsp,
(caddr_t)&bap[last + 1] - (caddr_t)bap,
zb, bp, DT_ABZERO);
UFS_BRWRITE(ufsvfsp, bp);
bp = copy, bap = bp->b_un.b_daddr;
}
} else {
/* make sure write retries are also cleared */
bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
bp->b_flags |= B_STALE | B_AGE;
}
/*
* Recursively free totally unused blocks.
*/
flags |= I_CHEAP;
for (i = NINDIR(fs) - 1; i > last; i--) {
nb = bap[i];
if (nb == 0)
continue;
if (level > SINGLE) {
blocksreleased +=
indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
} else
free(ip, nb, (off_t)fs->fs_bsize, flags);
blocksreleased += nblocks;
}
flags &= ~I_CHEAP;
/*
* Recursively free last partial block.
*/
if (level > SINGLE && lastbn >= 0) {
last = lastbn % factor;
nb = bap[i];
if (nb != 0)
blocksreleased +=
indirtrunc(ip, nb, last, level - 1, flags);
}
brelse(bp);
return (blocksreleased);
}
/*
* Truncate the inode ip to at most length size.
* Free affected disk blocks -- the blocks of the
* file are removed in reverse order.
*
* N.B.: triple indirect blocks are untested.
*/
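/*
 * Seed for the pseudo-random bumps applied to i_gen in the I_FREE case
 * below, so that a reused inode number gets a fresh generation number
 * (used, e.g., to invalidate stale NFS file handles).
 */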
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
struct fs *fs = oip->i_fs;
struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
struct inode *ip;
daddr_t lastblock;
off_t bsize;
int boff;
daddr_t bn, lastiblock[NIADDR];
int level;
long nblocks, blocksreleased = 0;
int i;
ushort_t mode;
struct inode tip;
int err;
u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
(UFS_MAXOFFSET_T) : (MAXOFF32_T);
/*
* Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
* other uses need the reader lock. opendq() holds the writer lock.
*/
ASSERT((oip->i_mode & IFMT) == IFSHAD ||
RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
ASSERT(RW_WRITE_HELD(&oip->i_contents));
/*
* We only allow truncation of regular files and directories
* to arbitrary lengths here. In addition, we allow symbolic
* links to be truncated only to zero length. Other inode
	 * types cannot have their length set here; disk blocks are
	 * being dealt with, and for device inodes ip->i_ordev is
	 * actually stored in ip->i_db[0]!
*/
TRANS_INODE(ufsvfsp, oip);
mode = oip->i_mode & IFMT;
if (flags & I_FREE) {
i_genrand *= 16843009; /* turns into shift and adds */
i_genrand++;
oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
oip->i_flag |= ICHG |IUPD;
oip->i_seq++;
if (length == oip->i_size)
return (0);
flags |= I_CHEAP;
}
if (mode == IFIFO)
return (0);
if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
!(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
return (EINVAL);
if (length > maxoffset)
return (EFBIG);
if ((mode == IFDIR) || (mode == IFATTRDIR))
flags |= I_DIR;
if (mode == IFSHAD)
flags |= I_SHAD;
if (oip == ufsvfsp->vfs_qinod)
flags |= I_QUOTA;
if (length == oip->i_size) {
/* update ctime and mtime to please POSIX tests */
oip->i_flag |= ICHG |IUPD;
oip->i_seq++;
if (length == 0) {
/* nothing to cache so clear the flag */
oip->i_flag &= ~IFASTSYMLNK;
}
return (0);
}
/* wipe out fast symlink till next access */
if (oip->i_flag & IFASTSYMLNK) {
int j;
ASSERT(ITOV(oip)->v_type == VLNK);
oip->i_flag &= ~IFASTSYMLNK;
for (j = 1; j < NDADDR; j++)
oip->i_db[j] = 0;
for (j = 0; j < NIADDR; j++)
oip->i_ib[j] = 0;
}
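	/* byte offset of the new EOF within its block; 0 means block-aligned */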
boff = (int)blkoff(fs, length);
if (length > oip->i_size) {
/*
		 * Trunc up case. BMAPALLOC will ensure that the right blocks
* are allocated. This includes extending the old frag to a
* full block (if needed) in addition to doing any work
* needed for allocating the last block.
*/
if (boff == 0)
err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
else
err = BMAPALLOC(oip, length - 1, boff, cr);
if (err == 0) {
/*
* Save old size and set inode's size now
* so that we don't cause too much of the
* file to be zero'd and pushed.
*/
u_offset_t osize = oip->i_size;
oip->i_size = length;
/*
* Make sure we zero out the remaining bytes of
* the page in case a mmap scribbled on it. We
* can't prevent a mmap from writing beyond EOF
* on the last page of a file.
*
*/
if ((boff = (int)blkoff(fs, osize)) != 0) {
bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
fs->fs_bsize : fragroundup(fs, boff);
pvn_vpzero(ITOV(oip), osize,
(size_t)(bsize - boff));
}
oip->i_flag |= ICHG|IATTCHG;
oip->i_seq++;
ITIMES_NOLOCK(oip);
/*
* MAXOFF32_T is old 2GB size limit. If
* this operation caused a large file to be
* created, turn on the superblock flag
* and update the superblock, if the flag
* is not already on.
*/
if ((length > (u_offset_t)MAXOFF32_T) &&
!(fs->fs_flags & FSLARGEFILES)) {
ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
mutex_enter(&ufsvfsp->vfs_lock);
fs->fs_flags |= FSLARGEFILES;
ufs_sbwrite(ufsvfsp);
mutex_exit(&ufsvfsp->vfs_lock);
}
}
return (err);
}
/*
* Update the pages of the file. If the file is not being
* truncated to a block boundary, the contents of the
* pages following the end of the file must be zero'ed
	 * in case they ever become accessible again because
* of subsequent file growth.
*/
if (boff == 0) {
(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
B_INVAL | B_TRUNC, CRED());
} else {
/*
* Make sure that the last block is properly allocated.
* We only really have to do this if the last block is
		 * actually allocated, since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated. Just to
		 * be sure, we do it now independently of current allocation.
*/
err = BMAPALLOC(oip, length - 1, boff, cr);
if (err)
return (err);
/*
* BMAPALLOC will call bmap_write which defers i_seq
* processing. If the timestamps were changed, update
* i_seq before rdip drops i_contents or syncs the inode.
*/
if (oip->i_flag & (ICHG|IUPD))
oip->i_seq++;
/*
* BugId 4069932
* Make sure that the relevant partial page appears in
* the v_pages list, so that pvn_vpzero() will do its
* job. Since doing this correctly requires everything
* in rdip() except for the uiomove(), it's easier and
* safer to do the uiomove() rather than duplicate the
* rest of rdip() here.
*
* To get here, we know that length indicates a byte
* that is not the first byte of a block. (length - 1)
* is the last actual byte known to exist. Deduction
* shows it is in the same block as byte (length).
* Thus, this rdip() invocation should always succeed
* except in the face of i/o errors, and give us the
* block we care about.
*
* rdip() makes the same locking assertions and
* assumptions as we do. We do not acquire any locks
* before calling it, so we have not changed the locking
* situation. Finally, there do not appear to be any
* paths whereby rdip() ends up invoking us again.
* Thus, infinite recursion is avoided.
*/
{
uio_t uio;
iovec_t iov[1];
char buffer;
uio.uio_iov = iov;
uio.uio_iovcnt = 1;
uio.uio_loffset = length - 1;
uio.uio_resid = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_CACHED;
iov[0].iov_base = &buffer;
iov[0].iov_len = 1;
err = rdip(oip, &uio, UIO_READ, NULL);
if (err)
return (err);
}
bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
fs->fs_bsize : fragroundup(fs, boff);
pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
/*
* Ensure full fs block is marked as dirty.
*/
(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
ufs_putapage, B_INVAL | B_TRUNC, CRED());
}
/*
* Calculate index into inode's block list of
* last direct and indirect blocks (if any)
* which we want to keep. Lastblock is -1 when
* the file is truncated to 0.
*/
lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
lastiblock[SINGLE] = lastblock - NDADDR;
lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
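	/* a negative lastiblock[] entry means that indirection level is freed entirely */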
nblocks = btodb(fs->fs_bsize);
/*
* Update file and block pointers
* on disk before we start freeing blocks.
* If we crash before free'ing blocks below,
* the blocks will be returned to the free list.
* lastiblock values are also normalized to -1
* for calls to indirtrunc below.
*/
tip = *oip; /* structure copy */
ip = &tip;
for (level = TRIPLE; level >= SINGLE; level--)
if (lastiblock[level] < 0) {
oip->i_ib[level] = 0;
lastiblock[level] = -1;
}
for (i = NDADDR - 1; i > lastblock; i--) {
oip->i_db[i] = 0;
flags |= I_CHEAP;
}
oip->i_size = length;
oip->i_flag |= ICHG|IUPD|IATTCHG;
oip->i_seq++;
if (!TRANS_ISTRANS(ufsvfsp))
ufs_iupdat(oip, I_SYNC); /* do sync inode update */
/*
* Indirect blocks first.
*/
for (level = TRIPLE; level >= SINGLE; level--) {
bn = ip->i_ib[level];
if (bn != 0) {
blocksreleased +=
indirtrunc(ip, bn, lastiblock[level], level, flags);
if (lastiblock[level] < 0) {
ip->i_ib[level] = 0;
free(ip, bn, (off_t)fs->fs_bsize,
flags | I_IBLK);
blocksreleased += nblocks;
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
* All whole direct blocks or frags.
*/
for (i = NDADDR - 1; i > lastblock; i--) {
bn = ip->i_db[i];
if (bn == 0)
continue;
ip->i_db[i] = 0;
bsize = (off_t)blksize(fs, ip, i);
free(ip, bn, bsize, flags);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
goto done;
/*
* Finally, look for a change in size of the
* last direct block; release any frags.
*/
bn = ip->i_db[lastblock];
if (bn != 0) {
off_t oldspace, newspace;
/*
* Calculate amount of space we're giving
* back as old block size minus new block size.
*/
oldspace = blksize(fs, ip, lastblock);
UFS_SET_ISIZE(length, ip);
newspace = blksize(fs, ip, lastblock);
if (newspace == 0) {
err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
return (err);
}
if (oldspace - newspace > 0) {
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
* required for the storage we're keeping.
*/
bn += numfrags(fs, newspace);
free(ip, bn, oldspace - newspace, flags);
blocksreleased += btodb(oldspace - newspace);
}
}
done:
/* BEGIN PARANOIA */
for (level = SINGLE; level <= TRIPLE; level++)
if (ip->i_ib[level] != oip->i_ib[level]) {
err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
return (err);
}
for (i = 0; i < NDADDR; i++)
if (ip->i_db[i] != oip->i_db[i]) {
err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
return (err);
}
/* END PARANOIA */
oip->i_blocks -= blocksreleased;
if (oip->i_blocks < 0) { /* sanity */
cmn_err(CE_NOTE,
"ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
(int)oip->i_blocks);
oip->i_blocks = 0;
}
oip->i_flag |= ICHG|IATTCHG;
oip->i_seq++;
	/* blocksreleased is >= zero, so this cannot fail */
(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
(size_t *)NULL);
return (0);
}
/*
* Check mode permission on inode. Mode is READ, WRITE or EXEC.
* In the case of WRITE, the read-only status of the file system
* is checked. Depending on the calling user, the appropriate
* mode bits are selected; privileges to override missing permission
* bits are checked through secpolicy_vnode_access().
*/
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
struct inode *ip = vip;
int shift = 0;
if (mode & IWRITE) {
/*
* Disallow write attempts on read-only
* file systems, unless the file is a block
* or character device or a FIFO.
*/
if (ip->i_fs->fs_ronly != 0) {
if ((ip->i_mode & IFMT) != IFCHR &&
(ip->i_mode & IFMT) != IFBLK &&
(ip->i_mode & IFMT) != IFIFO) {
return (EROFS);
}
}
}
/*
	 * If there is a shadow inode, check for the presence of an ACL;
	 * if an ACL is present, use the ufs_acl_access routine to check
	 * access against it.
*/
if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
return (ufs_acl_access(ip, mode, cr));
/*
* Access check is based on only
* one of owner, group, public.
* If not owner, then check group.
* If not a member of the group, then
* check public access.
*/
if (crgetuid(cr) != ip->i_uid) {
shift += 3;
if (!groupmember((uid_t)ip->i_gid, cr))
shift += 3;
}
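	/*
	 * Shifting i_mode left aligns the selected rwx triplet (owner,
	 * group, or other) with the IREAD/IWRITE/IEXEC bits in mode;
	 * whatever remains set is a permission the caller lacks.
	 */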
mode &= ~(ip->i_mode << shift);
if (mode == 0)
return (0);
/* test missing privilege bits */
return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}
/*
 * If necessary, remove an inode from the free list.
 * i_contents is held except at unmount.
*
* Return 1 if the inode is taken off of the ufs_idle_q,
* and the caller is expected to call VN_RELE.
*
* Return 0 otherwise.
*/
int
ufs_rmidle(struct inode *ip)
{
int rval = 0;
mutex_enter(&ip->i_tlock);
if ((ip->i_flag & IREF) == 0) {
mutex_enter(&ufs_idle_q.uq_mutex);
ip->i_freef->i_freeb = ip->i_freeb;
ip->i_freeb->i_freef = ip->i_freef;
ip->i_freef = ip;
ip->i_freeb = ip;
ip->i_flag |= IREF;
ufs_idle_q.uq_ne--;
if (ip->i_flag & IJUNKIQ) {
ufs_njunk_iq--;
ip->i_flag &= ~IJUNKIQ;
} else {
ufs_nuseful_iq--;
}
mutex_exit(&ufs_idle_q.uq_mutex);
rval = 1;
}
mutex_exit(&ip->i_tlock);
return (rval);
}
/*
 * scan the hash chains of inodes and call func on each inode
*/
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
struct ufsvfs *ufsvfsp)
{
struct inode *ip; /* current inode */
struct inode *lip = NULL; /* last/previous inode */
union ihead *ih; /* current hash chain */
int error, i;
int saverror = 0;
int lip_held; /* lip needs a VN_RELE() */
/*
* If ufsvfsp is NULL, then our caller should be holding
* ufs_scan_lock to avoid conflicts between ufs_unmount() and
* ufs_update(). Otherwise, to avoid false-positives in
* ufs_unmount()'s v_count-based EBUSY check, we only hold
* those inodes that are in the file system our caller cares
* about.
*
* We know that ip is a valid inode in the hash chain (and thus
* we can trust i_ufsvfs) because the inode we chained from
* (lip) is still in the hash chain. This is true because either:
*
* 1. We did not drop the hash chain lock since the last
* iteration (because we were not interested in the last inode),
* or
	 * 2. We maintained a hold on the last inode while we
	 * were processing it, so it could not be removed
* from the hash chain.
*
* The whole reason we're dropping and re-grabbing the chain
* lock on every inode is so that we don't present a major
* choke point on throughput, particularly when we've been
* called on behalf of fsflush.
*/
for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
mutex_enter(&ih_lock[i]);
for (ip = ih->ih_chain[0], lip_held = 0;
ip != (struct inode *)ih;
ip = lip->i_forw) {
ins.in_scan.value.ul++;
/*
* Undo the previous iteration's VN_HOLD(), but
* only if one was done.
*/
if (lip_held)
VN_RELE(ITOV(lip));
lip = ip;
if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
/*
* We're not processing all inodes, and
* this inode is not in the filesystem of
* interest, so skip it. No need to do a
* VN_HOLD() since we're not dropping the
* hash chain lock until after we've
* done the i_forw traversal above.
*/
lip_held = 0;
continue;
}
VN_HOLD(ITOV(ip));
lip_held = 1;
mutex_exit(&ih_lock[i]);
/*
* Acquire the contents lock as writer to make
* sure that the inode has been initialized in
* the cache or removed from the idle list by
* ufs_iget(). This works because ufs_iget()
* acquires the contents lock before putting
			 * the inode into the cache. If we can lock it,
			 * then ufs_iget() is done with it.
*/
if (rwtry) {
if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
mutex_enter(&ih_lock[i]);
continue;
}
} else {
rw_enter(&ip->i_contents, RW_WRITER);
}
rw_exit(&ip->i_contents);
/*
* ISTALE means the inode couldn't be read
*
* We don't have to hold the i_contents lock
* for this check for a couple of
* reasons. First, if ISTALE is set then the
* flag cannot be cleared until the inode is
* removed from the cache and that cannot
* happen until after we VN_RELE() it.
* Second, if ISTALE is not set, then the
* inode is in the cache and does not need to
* be read from disk so ISTALE cannot be set
* while we are not looking.
*/
if ((ip->i_flag & ISTALE) == 0) {
if ((error = (*func)(ip, arg)) != 0)
saverror = error;
}
mutex_enter(&ih_lock[i]);
}
if (lip_held)
VN_RELE(ITOV(lip));
mutex_exit(&ih_lock[i]);
}
return (saverror);
}
/*
* Mark inode with the current time, plus a unique increment.
*
* Since we only keep 32-bit time on disk, if UFS is still alive
* beyond 2038, filesystem times will simply stick at the last
* possible second of 32-bit time. Not ideal, but probably better
* than going into the remote past, or confusing applications with
* negative time.
*/
void
ufs_imark(struct inode *ip)
{
timestruc_t now;
int32_t usec, nsec;
/*
	 * The update of i_seq may have been deferred; increase i_seq here
	 * to make sure it is in sync with the timestamps.
*/
if (ip->i_flag & ISEQ) {
ASSERT(ip->i_flag & (IUPD|ICHG));
ip->i_seq++;
ip->i_flag &= ~ISEQ;
}
gethrestime(&now);
/*
* Fast algorithm to convert nsec to usec -- see hrt2ts()
* in common/os/timers.c for a full description.
*/
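	/* a shift-and-add approximation of nsec / 1000, avoiding a divide */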
nsec = now.tv_nsec;
usec = nsec + (nsec >> 2);
usec = nsec + (usec >> 1);
usec = nsec + (usec >> 2);
usec = nsec + (usec >> 4);
usec = nsec - (usec >> 3);
usec = nsec + (usec >> 2);
usec = nsec + (usec >> 3);
usec = nsec + (usec >> 4);
usec = nsec + (usec >> 1);
usec = nsec + (usec >> 6);
usec = usec >> 10;
mutex_enter(&ufs_iuniqtime_lock);
if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
usec > iuniqtime.tv_usec) {
if (now.tv_sec < TIME32_MAX) {
iuniqtime.tv_sec = (time32_t)now.tv_sec;
iuniqtime.tv_usec = usec;
}
} else {
if (iuniqtime.tv_sec < TIME32_MAX) {
iuniqtime.tv_usec++;
/* Check for usec overflow */
if (iuniqtime.tv_usec >= MICROSEC) {
iuniqtime.tv_sec++;
iuniqtime.tv_usec = 0;
}
}
}
if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
ip->i_atime = iuniqtime;
}
if (ip->i_flag & IUPD) {
ip->i_mtime = iuniqtime;
ip->i_flag |= IMODTIME;
}
if (ip->i_flag & ICHG) {
ip->i_diroff = 0;
ip->i_ctime = iuniqtime;
}
mutex_exit(&ufs_iuniqtime_lock);
}
/*
* Update timestamps in inode.
*/
void
ufs_itimes_nolock(struct inode *ip)
{
/*
* if noatime is set and the inode access time is the only field that
* must be changed, exit immediately.
*/
if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
(ip->i_ufsvfs->vfs_noatime)) {
return;
}
if (ip->i_flag & (IUPD|IACC|ICHG)) {
if (ip->i_flag & ICHG)
ip->i_flag |= IMOD;
else
ip->i_flag |= IMODACC;
ufs_imark(ip);
ip->i_flag &= ~(IACC|IUPD|ICHG);
}
}