fs/ufs/ufs_lockfs.c

	ufs_lockfs.c revision 13237b7e1e5bd293e466307b2e06f8e0e2321a0a
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>     /* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>
#include <sys/swap.h>
#include <sys/dnlc.h>

extern struct vnode *common_specvp(struct vnode *vp);

/*
 * error lock status
 *
 * Computed by ufs__fiolfs() from the previous and the requested lock
 * state and passed down to the reconciliation routines.
 */
/* previous state was an error lock and the new state is unlocked */
#define UN_ERRLCK   (-1)
/* an error lock is being applied and the previous state was not one */
#define SET_ERRLCK  1
/* an error lock is being applied on top of an existing error lock */
#define RE_ERRLCK   2
/* the request does not involve an error lock transition */
#define NO_ERRLCK   0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

/*
 * One node of the per-thread lockfs list.  The list head is fetched with
 * tsd_get(ufs_lockfs_key) and each node records one file system
 * (identified by its ulockfs) for the owning thread.
 */
typedef struct _ulockfs_info {
    /* next node in the list, NULL terminates it */
    struct _ulockfs_info *next;
    /* file system this node refers to; NULL is treated as a free node */
    struct ulockfs *ulp;
    /* ULOCK_INFO_* bits */
    uint_t flags;
} ulockfs_info_t;

#define ULOCK_INFO_FALLOCATE    0x00000001  /* fallocate thread */

/*
 * Check in TSD whether this thread is already doing a VOP on the file
 * system identified by `key'.
 *
 *  found   (out) set to 1 if a node whose ulp equals `key' exists, else 0
 *  head    (in)  first node of the thread's ulockfs_info_t list
 *  key     (in)  struct ulockfs pointer to look for
 *  free    (out) first node with a NULL ulp seen before the walk stopped,
 *                or NULL if there was none
 *
 * The third parameter is deliberately NOT called `ulp': a macro parameter
 * with the same name as the structure member would also rewrite
 * `_curr->ulp', so the macro would only work when the caller's argument
 * happened to be spelled `ulp'.
 */
#define IS_REC_VOP(found, head, key, free)      \
{                           \
    ulockfs_info_t *_curr;              \
                            \
    for ((found) = 0, (free) = NULL, _curr = (head);    \
        _curr != NULL; _curr = _curr->next) {   \
        if (((free) == NULL) &&         \
            (_curr->ulp == NULL))       \
            (free) = _curr;         \
        if (_curr->ulp == (key)) {      \
            (found) = 1;            \
            break;              \
        }                   \
    }                       \
}

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 *  head    (in)  first node of the thread's ulockfs_info_t list
 *  key     (in)  struct ulockfs pointer to look for
 *  info    (out) the node whose ulp equals `key', or NULL if none
 *
 * The second parameter is deliberately NOT called `ulp': a macro parameter
 * with the same name as the structure member would also rewrite
 * `_curr->ulp', so the macro would only work when the caller's argument
 * happened to be spelled `ulp'.
 */
#define SEARCH_ULOCKFSP(head, key, info)        \
{                           \
    ulockfs_info_t *_curr;              \
                            \
    for (_curr = (head); _curr != NULL;     \
        _curr = _curr->next) {          \
        if (_curr->ulp == (key)) {      \
            break;              \
        }                   \
    }                       \
                            \
    (info) = _curr;                 \
}

/*
 * ufs_getlfd
 *  Validate a lockfs request against the current lock state and, when it
 *  is acceptable, store the key for the new state in the request.
 *
 *  Returns 0 on success, or EINVAL if any request flag is set or if the
 *  file system is currently locked and the request carries a different
 *  key than the current lock.
 */
static int
ufs_getlfd(
    struct lockfs *lockfsp,     /* new lock request */
    struct lockfs *ul_lockfsp)  /* old lock state */
{
    /* no input flags are defined, so any set bit is rejected */
    if (lockfsp->lf_flags != 0)
        return (EINVAL);

    /* a locked file system may only be changed with the matching key */
    if (!LOCKFS_IS_ULOCK(ul_lockfsp) &&
        lockfsp->lf_key != ul_lockfsp->lf_key)
        return (EINVAL);

    /* key of the new state is the old key plus one */
    lockfsp->lf_key = ul_lockfsp->lf_key + 1;

    return (0);
}

/*
 * ufs_checkaccton
 *  Report whether accounting is turned on for this fs.
 *
 *  Returns EDEADLK when acct_fs_in_use() says so for vp, otherwise 0.
 */

int
ufs_checkaccton(struct vnode *vp)
{
    return (acct_fs_in_use(vp) ? EDEADLK : 0);
}

/*
 * ufs_checkswapon
 *  Report whether any entry on the swapinfo list refers to a vnode that
 *  lives on the same vfs as vp.
 *
 *  Returns EDEADLK if such an entry exists, otherwise 0.  The list is
 *  walked with swapinfo_lock held.
 */
int
ufs_checkswapon(struct vnode *vp)
{
    struct swapinfo *entry;
    int result = 0;

    mutex_enter(&swapinfo_lock);
    entry = swapinfo;
    while (entry) {
        if (entry->si_vp->v_vfsp == vp->v_vfsp) {
            result = EDEADLK;
            break;
        }
        entry = entry->si_next;
    }
    mutex_exit(&swapinfo_lock);

    return (result);
}

/*
 * ufs_freeze
 *  Switch the file system's lock state to the requested one so that
 *  future accesses are pended against the new lock.
 *
 *  Copies lock type, key and comment from the request into ulp and sets
 *  ul_fs_lock to the single bit selected by the new lock type.
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
    struct lockfs *cur = &ulp->ul_lockfs;

    /* take over the description of the new lock */
    cur->lf_lock = lockfsp->lf_lock;
    cur->lf_key = lockfsp->lf_key;
    cur->lf_comlen = lockfsp->lf_comlen;
    cur->lf_comment = lockfsp->lf_comment;

    /* bit mask form of the lock type, tested by ufs_check_lockfs() */
    ulp->ul_fs_lock = (1 << cur->lf_lock);
}

/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting ufs_quiesce() protocol and decrement it only when a file system no
 * longer has to be in quiescent state. This allows ufs_pageio() to detect
 * that another thread wants to quiesce a file system. See more comments in
 * ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;

/*
 * ufs_quiesce
 *  wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
    int error = 0;
    ulockfs_info_t *head;
    ulockfs_info_t *info;

    head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    SEARCH_ULOCKFSP(head, ulp, info);

    /*
     * Set a softlock to suspend future ufs_vnops so that
     * this lockfs request will not be starved
     */
    ULOCKFS_SET_SLOCK(ulp);
    ASSERT(ufs_quiesce_pend);

    /* check if there is any outstanding ufs vnodeops calls */
    while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
        /*
         * use timed version of cv_wait_sig() to make sure we don't
         * miss a wake up call from ufs_pageio() when it doesn't use
         * ul_lock.
         *
         * when a fallocate thread comes in, the only way it returns
         * from this function is if there are no other vnode operations
         * going on (remember fallocate threads are tracked using
         * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
         * hasn't already grabbed the fs write lock.
         */
        if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
            if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
                goto out;
        }
        if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
            error = EINTR;
            goto out;
        }
    }

out:
    /*
     * unlock the soft lock
     */
    ULOCKFS_CLR_SLOCK(ulp);

    return (error);
}

/*
 * ufs_flush_inode
 *  Inode-scan callback: push and invalidate the pages of one inode.
 *
 *  arg is the ufsvfs being flushed; inodes of other file systems are
 *  ignored.  Returns 0, or the last error worth reporting (EAGAIN from
 *  the asynchronous push is not reported).
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
    int rc;
    int result = 0;

    /* belongs to another file system; nothing to do */
    if (ip->i_ufsvfs != (struct ufsvfs *)arg)
        return (0);

    /* start writing all dirty pages without waiting */
    rc = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI);
    if (rc != 0 && rc != EAGAIN)
        result = rc;

    /* wait for the i/o and throw away all mappings */
    rc = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI);
    if (rc != 0)
        result = rc;

    /* directories also lose their cached directory entries */
    if (ITOV(ip)->v_type == VDIR)
        dnlc_dir_purge(&ip->i_danchor);

    return (result);
}

/*
 * ufs_flush
 *  Flush everything that is currently dirty; this includes invalidating
 *  any mappings.
 *
 *  The caller must hold the vfs lock (asserted below).
 *
 *  Returns 0, or the most recent error recorded in saverror from the
 *  inode scan or the device VOP_PUTPAGE.  saverror is also handed to
 *  TRANS_END_SYNC; presumably that macro stores the commit error in it --
 *  confirm against its definition.
 */
int
ufs_flush(struct vfs *vfsp)
{
    int     error;
    int     saverror = 0;
    struct ufsvfs   *ufsvfsp    = (struct ufsvfs *)vfsp->vfs_data;
    struct fs   *fs     = ufsvfsp->vfs_fs;
    /* set when T_DONTBLOCK was already on, so it is not cleared here */
    int     tdontblock = 0;

    ASSERT(vfs_lock_held(vfsp));

    /*
     * purge dnlc
     */
    (void) dnlc_purge_vfsp(vfsp, 0);

    /*
     * drain the delete and idle threads
     */
    ufs_delete_drain(vfsp, 0, 0);
    ufs_idle_drain(vfsp);

    /*
     * flush and invalidate quota records
     */
    (void) qsync(ufsvfsp);

    /*
     * flush w/invalidate the inodes for vfsp
     */
    if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
        saverror = error;

    /*
     * synchronously flush superblock and summary info
     */
    if (fs->fs_ronly == 0 && fs->fs_fmod) {
        fs->fs_fmod = 0;
        TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
    }
    /*
     * flush w/invalidate block device pages and buf cache
     */
    if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
        (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
        saverror = error;

    (void) bflush((dev_t)vfsp->vfs_dev);
    (void) bfinval((dev_t)vfsp->vfs_dev, 0);

    /*
     * drain the delete and idle threads again
     */
    ufs_delete_drain(vfsp, 0, 0);
    ufs_idle_drain(vfsp);

    /*
     * play with the clean flag
     * (only when nothing above reported an error)
     */
    if (saverror == 0)
        ufs_checkclean(vfsp);

    /*
     * Flush any outstanding transactions and roll the log
     * only if we are supposed to do, i.e. LDL_NOROLL not set.
     * We can not simply check for fs_ronly here since fsck also may
     * use this code to roll the log on a read-only filesystem, e.g.
     * root during early stages of boot, if other then a sanity check is
     * done, it will clear LDL_NOROLL before.
     * In addition we assert that the deltamap does not contain any deltas
     * in case LDL_NOROLL is set since this is not supposed to happen.
     */
    if (TRANS_ISTRANS(ufsvfsp)) {
        ml_unit_t   *ul = ufsvfsp->vfs_log;
        mt_map_t    *mtm    = ul->un_deltamap;

        if (ul->un_flags & LDL_NOROLL) {
            ASSERT(mtm->mtm_nme == 0);
        } else {
            /*
             * Do not set T_DONTBLOCK if there is a
             * transaction opened by caller.
             */
            if (curthread->t_flag & T_DONTBLOCK)
                tdontblock = 1;
            else
                curthread->t_flag |= T_DONTBLOCK;

            TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
                TOP_COMMIT_SIZE, error);

            /* only end the transaction if it was actually begun */
            if (!error) {
                TRANS_END_SYNC(ufsvfsp, saverror,
                    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
            }

            /* restore T_DONTBLOCK to the state found on entry */
            if (tdontblock == 0)
                curthread->t_flag &= ~T_DONTBLOCK;

            logmap_roll_dev(ufsvfsp->vfs_log);
        }
    }

    return (saverror);
}

/*
 * ufs_thaw_wlock
 *  special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
    /*
     * wrong file system; keep looking
     */
    if (ip->i_ufsvfs != (struct ufsvfs *)arg)
        return (0);

    /*
     * iupdat refuses to clear flags if the fs is read only.  The fs
     * may become read/write during the lock and we wouldn't want
     * these inodes being written to disk.  So clear the flags.
     */
    rw_enter(&ip->i_contents, RW_WRITER);
    ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
    rw_exit(&ip->i_contents);

    /*
     * pages are mlocked -- fail wlock
     */
    if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
        return (EBUSY);

    return (0);
}

/*
 * ufs_thaw_hlock
 *  Inode-scan callback used when thawing down to a hard lock or an
 *  error lock.
 *
 *  arg is the ufsvfs being locked; inodes of other file systems are
 *  ignored.  Always returns 0.
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
    struct vnode    *vp = ITOV(ip);

    /* belongs to another file system; nothing to do */
    if (ip->i_ufsvfs != (struct ufsvfs *)arg)
        return (0);

    /*
     * Forcibly invalidate the pages -- even mlocked ones -- and keep
     * doing so until a non-VCHR vnode has no cached data left.
     */
    for (;;) {
        (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
        if (vp->v_type == VCHR || !vn_has_cached_data(vp))
            break;
    }

    /* forget every pending inode update */
    rw_enter(&ip->i_contents, RW_WRITER);
    ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
    rw_exit(&ip->i_contents);

    return (0);
}

/*
 * ufs_thaw
 *  Bring the file system in line with the lock state currently recorded
 *  in ulp and wake up everybody waiting on ul_cv.
 *
 *  For a write, hard or error lock: access times and deleted-file
 *  processing are turned off, the file system is flushed twice and
 *  superblock writes are disabled.  A write lock additionally fails with
 *  the first error seen; hard and error locks discard errors.
 *  For any other state the restrictions are lifted again.
 *
 *  Returns 0 or an errno value.
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
    int     rc = 0;
    int     had_noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

    if (!(ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
        ULOCKFS_IS_ELOCK(ulp))) {
        /*
         * okay to keep access times
         * okay to free deleted files
         * okay to write the superblock
         */
        ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
        ulp->ul_sbowner = NULL;

        /* flush in case deleted files are in memory */
        if (had_noidel)
            rc = ufs_flush(vfsp);
        goto wakeup;
    }

    /*
     * don't keep access times
     * don't free deleted files
     * if superblock writes are allowed, limit them to me for now
     */
    ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
    if (ulp->ul_sbowner != (kthread_id_t)-1)
        ulp->ul_sbowner = curthread;

    /* wait for writes for deleted files and superblock updates */
    (void) ufs_flush(vfsp);

    /*
     * now make sure the quota file is up-to-date
     *  expensive; but effective
     */
    rc = ufs_flush(vfsp);

    /* from here on no one can write the superblock */
    ulp->ul_sbowner = (kthread_id_t)-1;

    /* write lock: every step must succeed */
    if (ULOCKFS_IS_WLOCK(ulp)) {
        if (rc == 0)
            rc = bfinval(ufsvfsp->vfs_dev, 0);
        if (rc == 0)
            rc = ufs_scan_inodes(0, ufs_thaw_wlock,
                (void *)ufsvfsp, ufsvfsp);
        if (rc != 0)
            goto wakeup;
    }

    /* hard or error lock: errors are ignored, everything is discarded */
    if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
        rc = 0;
        (void) ufs_scan_inodes(0, ufs_thaw_hlock,
            (void *)ufsvfsp, ufsvfsp);
        (void) bfinval(ufsvfsp->vfs_dev, 1);
    }

wakeup:
    cv_broadcast(&ulp->ul_cv);
    return (rc);
}

/*
 * ufs_reconcile_fs
 *  Reconcile the incore superblock with the ondisk superblock.
 *
 *  errlck is one of the *_ERRLCK values; UN_ERRLCK enables the extra
 *  checks that only allow an error lock to be released once the ondisk
 *  clean flag shows the file system has been repaired.
 *
 *  Returns 0 on success, otherwise:
 *      EIO     the superblock or the summary info could not be read
 *      EAGAIN  unlocking an error lock while fs_clean is FSFIX
 *      EBUSY   unlocking an error lock while fs_clean is not yet the
 *              expected final value
 *      EACCES  a field that must not change differs, or the ondisk
 *              superblock is bad while the incore one is FSLOG
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
    struct fs   *mfs = ufsvfsp->vfs_fs; /* in-memory superblock */
    struct fs   *dfs;           /* on-disk   superblock */
    struct buf  *bp;            /* on-disk   superblock buf */
    int     took_lock;
    int     rc = 0;
    char        finished_fsclean;

    /* read the on-disk copy of the superblock */
    bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
    bp->b_flags |= (B_STALE|B_AGE);
    if (bp->b_flags & B_ERROR) {
        rc = EIO;
        goto done;
    }
    dfs = bp->b_un.b_fs;

    /* error locks may only unlock after the fs has been made consistent */
    if (errlck == UN_ERRLCK) {
        /* still being repaired */
        if (dfs->fs_clean == FSFIX) {
            rc = EAGAIN;
            goto done;
        }
        /* repair not yet started? */
        finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
        if (dfs->fs_clean != finished_fsclean) {
            rc = EBUSY;
            goto done;
        }
    }

    /* give up if the superblock has changed too much */
    if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
        (mfs->fs_cblkno != dfs->fs_cblkno) ||
        (mfs->fs_iblkno != dfs->fs_iblkno) ||
        (mfs->fs_dblkno != dfs->fs_dblkno) ||
        (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
        (mfs->fs_cgmask != dfs->fs_cgmask) ||
        (mfs->fs_bsize != dfs->fs_bsize) ||
        (mfs->fs_fsize != dfs->fs_fsize) ||
        (mfs->fs_frag != dfs->fs_frag) ||
        (mfs->fs_bmask != dfs->fs_bmask) ||
        (mfs->fs_fmask != dfs->fs_fmask) ||
        (mfs->fs_bshift != dfs->fs_bshift) ||
        (mfs->fs_fshift != dfs->fs_fshift) ||
        (mfs->fs_fragshift != dfs->fs_fragshift) ||
        (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
        (mfs->fs_sbsize != dfs->fs_sbsize) ||
        (mfs->fs_nindir != dfs->fs_nindir) ||
        (mfs->fs_nspf != dfs->fs_nspf) ||
        (mfs->fs_trackskew != dfs->fs_trackskew) ||
        (mfs->fs_cgsize != dfs->fs_cgsize) ||
        (mfs->fs_ntrak != dfs->fs_ntrak) ||
        (mfs->fs_nsect != dfs->fs_nsect) ||
        (mfs->fs_spc != dfs->fs_spc) ||
        (mfs->fs_cpg != dfs->fs_cpg) ||
        (mfs->fs_ipg != dfs->fs_ipg) ||
        (mfs->fs_fpg != dfs->fs_fpg) ||
        (mfs->fs_postblformat != dfs->fs_postblformat) ||
        (mfs->fs_magic != dfs->fs_magic)) {
        rc = EACCES;
        goto done;
    }

    /* a bad ondisk superblock cannot be reconciled with a logging fs */
    if ((dfs->fs_clean == FSBAD ||
        FSOKAY != dfs->fs_state + dfs->fs_time) &&
        mfs->fs_clean == FSLOG) {
        rc = EACCES;
        goto done;
    }

    /* fetch the new summary info */
    if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
        rc = EIO;
        goto done;
    }

    /* drop the old summary info and adopt the new one */
    kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
    mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;    /* Only entry 0 used */

    /* copy the fields that are allowed to change */
    mfs->fs_size = dfs->fs_size;
    mfs->fs_dsize = dfs->fs_dsize;
    mfs->fs_ncg = dfs->fs_ncg;
    mfs->fs_minfree = dfs->fs_minfree;
    mfs->fs_rotdelay = dfs->fs_rotdelay;
    mfs->fs_rps = dfs->fs_rps;
    mfs->fs_maxcontig = dfs->fs_maxcontig;
    mfs->fs_maxbpg = dfs->fs_maxbpg;
    mfs->fs_csmask = dfs->fs_csmask;
    mfs->fs_csshift = dfs->fs_csshift;
    mfs->fs_optim = dfs->fs_optim;
    mfs->fs_csaddr = dfs->fs_csaddr;
    mfs->fs_cssize = dfs->fs_cssize;
    mfs->fs_ncyl = dfs->fs_ncyl;
    mfs->fs_cstotal = dfs->fs_cstotal;
    mfs->fs_reclaim = dfs->fs_reclaim;

    /* a pending or running reclaim is (re)started as running */
    if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
        mfs->fs_reclaim &= ~FS_RECLAIM;
        mfs->fs_reclaim |=  FS_RECLAIMING;
        ufs_thread_start(&ufsvfsp->vfs_reclaim,
            ufs_thread_reclaim, vfsp);
    }

    /* XXX What to do about sparecon? */

    /* XXX need to copy volume label */

    /*
     * ondisk clean flag overrides inmemory clean flag iff == FSBAD
     * or if error-locked and ondisk is now clean
     *
     * vfs_lock is taken here only if the caller does not hold it yet.
     */
    took_lock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
    if (took_lock)
        mutex_enter(&ufsvfsp->vfs_lock);

    if (errlck == UN_ERRLCK) {
        mfs->fs_clean = (finished_fsclean == dfs->fs_clean) ?
            finished_fsclean : FSBAD;
        mfs->fs_state = FSOKAY - dfs->fs_time;
    }

    if (FSOKAY != dfs->fs_state + dfs->fs_time ||
        (dfs->fs_clean == FSBAD))
        mfs->fs_clean = FSBAD;

    if (took_lock)
        mutex_exit(&ufsvfsp->vfs_lock);

done:
    /* the superblock buffer is released on every path */
    brelse(bp);

    return (rc);
}

/*
 * ufs_reconcile_inode
 *  reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
    int     i;
    int     ndaddr;
    int     niaddr;
    struct dinode   *dp;        /* ondisk inode */
    struct buf  *bp = NULL;
    uid_t       d_uid;
    gid_t       d_gid;
    int     error = 0;
    struct fs   *fs;

    /*
     * not an inode we care about
     */
    if (ip->i_ufsvfs != (struct ufsvfs *)arg)
        return (0);

    fs = ip->i_fs;

    /*
     * Inode reconciliation fails: we made the filesystem quiescent
     * and we did a ufs_flush() before calling ufs_reconcile_inode()
     * and thus the inode should not have been changed inbetween.
     * Any discrepancies indicate a logic error and a pretty
     * significant run-state inconsistency we should complain about.
     */
    if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
        cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
            "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
        return (EINVAL);
    }

    /*
     * get the dinode
     */
    bp = UFS_BREAD(ip->i_ufsvfs,
        ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
        (int)fs->fs_bsize);
    if (bp->b_flags & B_ERROR) {
        brelse(bp);
        return (EIO);
    }
    dp  = bp->b_un.b_dino;
    dp += itoo(fs, ip->i_number);

    /*
     * handle Sun's implementation of EFT
     */
    d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
    d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;

    rw_enter(&ip->i_contents, RW_WRITER);

    /*
     * some fields are not allowed to change
     */
    if ((ip->i_mode  != dp->di_mode) ||
        (ip->i_gen   != dp->di_gen) ||
        (ip->i_uid   != d_uid) ||
        (ip->i_gid   != d_gid)) {
        error = EACCES;
        goto out;
    }

    /*
     * and some are allowed to change
     */
    ip->i_size      = dp->di_size;
    ip->i_ic.ic_flags   = dp->di_ic.ic_flags;
    ip->i_blocks        = dp->di_blocks;
    ip->i_nlink     = dp->di_nlink;
    if (ip->i_flag & IFASTSYMLNK) {
        ndaddr = 1;
        niaddr = 0;
    } else {
        ndaddr = NDADDR;
        niaddr = NIADDR;
    }
    for (i = 0; i < ndaddr; ++i)
        ip->i_db[i] = dp->di_db[i];
    for (i = 0; i < niaddr; ++i)
        ip->i_ib[i] = dp->di_ib[i];

out:
    rw_exit(&ip->i_contents);
    brelse(bp);
    return (error);
}

/*
 * ufs_reconcile
 *  Reconcile the ondisk superblock and inodes with their incore copies.
 *
 *  The file system is flushed before and after the reconciliation; the
 *  results of those flushes are ignored.  Returns 0 on success or the
 *  first error from ufs_reconcile_fs() or the inode scan.
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
    int rc;

    /* get rid of as much inmemory data as possible */
    (void) ufs_flush(vfsp);

    /* superblock first ... */
    rc = ufs_reconcile_fs(vfsp, ufsvfsp, errlck);
    if (rc != 0)
        return (rc);

    /* ... then every incore inode of this file system */
    rc = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp);
    if (rc != 0)
        return (rc);

    /* allocation blocks may be incorrect; get rid of them */
    (void) ufs_flush(vfsp);

    return (0);
}

/*
 * File system locking
 *
 * Entry point that forwards to ufs__fiolfs() with from_user set, passing
 * vp, lockfsp and from_log through unchanged.  Returns whatever
 * ufs__fiolfs() returns.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
    const int from_user = 1;

    return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}

/*
 * kernel-internal interface, also used by fix-on-panic
 *
 * Change the lock state of the file system that vp belongs to.
 *
 *  vp          any vnode on the file system
 *  lockfsp     requested lock; on success lf_key is updated by
 *              ufs_getlfd() to the key of the new state
 *  from_user   non-zero for user requests; only consulted at the end to
 *              decide whether a freshly set error lock calls ufs_fault()
 *  from_log    non-zero skips the accounting / swap file check
 *
 * Outline: take the vfs lock, suspend the reclaim and delete threads,
 * take ul_lock, validate the transition, freeze to the new state, quiesce,
 * reconcile (when leaving a write or error lock), flush, thaw, and finally
 * release everything again.  On failure after the freeze the previous
 * lock state is restored (unless the fs is now hard locked).
 *
 * Returns 0 on success, otherwise an errno value: EINVAL, EIO, EDEADLK and
 * EBUSY are produced directly below; other values come from
 * ufs_quiesce(), ufs_reconcile(), ufs_flush() or ufs_thaw().
 */
int
ufs__fiolfs(
    struct vnode *vp,
    struct lockfs *lockfsp,
    int from_user,
    int from_log)
{
    struct ulockfs  *ulp;
    struct lockfs   lfs;
    int     error;
    struct vfs  *vfsp;
    struct ufsvfs   *ufsvfsp;
    int      errlck     = NO_ERRLCK;
    int      poll_events    = POLLPRI;
    extern struct pollhead ufs_pollhd;
    ulockfs_info_t *head;
    ulockfs_info_t *info;
    /* set when ufs_quiesce() failed; suppresses ufs_thaw() at errout */
    int signal = 0;

    /* check valid lock type */
    if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
        return (EINVAL);

    if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
        return (EIO);

    vfsp = vp->v_vfsp;

    if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
        return (EIO);

    /* take the lock and check again */
    vfs_lock_wait(vfsp);
    if (vfsp->vfs_flag & VFS_UNMOUNTED) {
        vfs_unlock(vfsp);
        return (EIO);
    }

    /*
     * Can't wlock or ro/elock fs with accounting or local swap file
     * We need to check for this before we grab the ul_lock to avoid
     * deadlocks with the accounting framework.
     */
    if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
        LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
        if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
            vfs_unlock(vfsp);
            return (EDEADLK);
        }
    }

    ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
    ulp = &ufsvfsp->vfs_ulockfs;
    /* this thread's TSD record for the file system, NULL if it has none */
    head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    SEARCH_ULOCKFSP(head, ulp, info);

    /*
     * Suspend both the reclaim thread and the delete thread.
     * This must be done outside the lockfs locking protocol.
     */
    ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
    ufs_thread_suspend(&ufsvfsp->vfs_delete);

    mutex_enter(&ulp->ul_lock);
    /* balanced by the decrement on the success path and at errexit */
    atomic_add_long(&ufs_quiesce_pend, 1);

    /*
     * Quit if there is another lockfs request in progress
     * that is waiting for existing ufs_vnops to complete.
     */
    if (ULOCKFS_IS_BUSY(ulp)) {
        error = EBUSY;
        goto errexit;
    }

    /* cannot ulocked or downgrade a hard-lock */
    if (ULOCKFS_IS_HLOCK(ulp)) {
        error = EIO;
        goto errexit;
    }

    /* an error lock may be unlocked or relocked, only */
    if (ULOCKFS_IS_ELOCK(ulp)) {
        if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
            error = EBUSY;
            goto errexit;
        }
    }

    /*
     * a read-only error lock may only be upgraded to an
     * error lock or hard lock
     */
    if (ULOCKFS_IS_ROELOCK(ulp)) {
        if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
            error = EBUSY;
            goto errexit;
        }
    }

    /*
     * until read-only error locks are fully implemented
     * just return EINVAL
     */
    if (LOCKFS_IS_ROELOCK(lockfsp)) {
        error = EINVAL;
        goto errexit;
    }

    /*
     * an error lock may only be applied if the file system is
     * unlocked or already error locked.
     * (this is to prevent the case where a fs gets changed out from
     * underneath a fs that is locked for backup,
     * that is, name/delete/write-locked.)
     */
    if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
        !ULOCKFS_IS_ROELOCK(ulp)) &&
        (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
        error = EBUSY;
        goto errexit;
    }

    /* get and validate the input lockfs request */
    if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
        goto errexit;

    /*
     * save current ulockfs struct
     * (lfs is the "old" state from here on; ulp holds the "new" one)
     */
    bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

    /*
     * Freeze the file system (pend future accesses)
     */
    ufs_freeze(ulp, lockfsp);

    /*
     * Set locking in progress because ufs_quiesce may free the
     * ul_lock mutex.
     */
    ULOCKFS_SET_BUSY(ulp);
    /* update the ioctl copy */
    LOCKFS_SET_BUSY(&ulp->ul_lockfs);

    /*
     * We  need to unset FWLOCK status before we call ufs_quiesce
     * so that the thread doesnt get suspended. We do this only if
     * this (fallocate) thread requested an unlock operation.
     */
    if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
        if (!ULOCKFS_IS_WLOCK(ulp))
            ULOCKFS_CLR_FWLOCK(ulp);
    }

    /*
     * Quiesce (wait for outstanding accesses to finish)
     */
    if (error = ufs_quiesce(ulp)) {
        /*
         * Interrupted due to signal. There could still be
         * pending vnops.
         */
        signal = 1;

        /*
         * We do broadcast because lock-status
         * could be reverted to old status.
         */
        cv_broadcast(&ulp->ul_cv);
        goto errout;
    }

    /*
     * If the fallocate thread requested a write fs lock operation
     * then we set fwlock status in the ulp.
     */
    if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
        if (ULOCKFS_IS_WLOCK(ulp))
            ULOCKFS_SET_FWLOCK(ulp);
    }

    /*
     * save error lock status to pass down to reconcilation
     * routines and for later cleanup
     */
    if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
        errlck = UN_ERRLCK;

    if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
        /* vfs_lock is only taken here if this thread doesn't hold it */
        int needs_unlock;
        int needs_sbwrite;

        poll_events |= POLLERR;
        errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
            RE_ERRLCK : SET_ERRLCK;

        needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
        if (needs_unlock)
            mutex_enter(&ufsvfsp->vfs_lock);

        /* disable delayed i/o */
        needs_sbwrite = 0;

        if (errlck == SET_ERRLCK) {
            ufsvfsp->vfs_fs->fs_clean = FSBAD;
            needs_sbwrite = 1;
        }

        needs_sbwrite |= ufsvfsp->vfs_dio;
        ufsvfsp->vfs_dio = 0;

        if (needs_unlock)
            mutex_exit(&ufsvfsp->vfs_lock);

        if (needs_sbwrite) {
            ulp->ul_sbowner = curthread;
            TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

            if (needs_unlock)
                mutex_enter(&ufsvfsp->vfs_lock);

            ufsvfsp->vfs_fs->fs_fmod = 0;

            if (needs_unlock)
                mutex_exit(&ufsvfsp->vfs_lock);
        }
    }

    /*
     * reconcile superblock and inodes if was wlocked
     * (also done when the old state was an error lock)
     */
    if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
        if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
            goto errout;
        /*
         * in case the fs grew; reset the metadata map for logging tests
         */
        TRANS_MATA_UMOUNT(ufsvfsp);
        TRANS_MATA_MOUNT(ufsvfsp);
        TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
    }

    /*
     * At least everything *currently* dirty goes out.
     * A flush error is ignored when the new state is a hard or error lock.
     */

    if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
        !ULOCKFS_IS_ELOCK(ulp))
        goto errout;

    /*
     * thaw file system and wakeup pended processes
     * (again, errors are ignored for a hard or error lock)
     */
    if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
        if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
            goto errout;

    /*
     * reset modified flag if not already write locked
     */
    if (!LOCKFS_IS_WLOCK(&lfs))
        ULOCKFS_CLR_MOD(ulp);

    /*
     * idle the lock struct
     */
    ULOCKFS_CLR_BUSY(ulp);
    /* update the ioctl copy */
    LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

    /*
     * free current comment
     * (the comment of the old state saved in lfs; the new state's comment
     * now lives in ulp->ul_lockfs)
     */
    if (lfs.lf_comment && lfs.lf_comlen != 0) {
        kmem_free(lfs.lf_comment, lfs.lf_comlen);
        lfs.lf_comment = NULL;
        lfs.lf_comlen = 0;
    }

    /* do error lock cleanup */
    if (errlck == UN_ERRLCK)
        ufsfx_unlockfs(ufsvfsp);

    else if (errlck == RE_ERRLCK)
        ufsfx_lockfs(ufsvfsp);

    /* don't allow error lock from user to invoke panic */
    else if (from_user && errlck == SET_ERRLCK &&
        !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
        (void) ufs_fault(ufsvfsp->vfs_root,
            ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
            ulp->ul_lockfs.lf_comment: "user-applied error lock");

    atomic_add_long(&ufs_quiesce_pend, -1);
    mutex_exit(&ulp->ul_lock);
    vfs_unlock(vfsp);

    if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
        poll_events |= POLLERR;

    pollwakeup(&ufs_pollhd, poll_events);

    /*
     * Allow both the delete thread and the reclaim thread to
     * continue.
     */
    ufs_thread_continue(&ufsvfsp->vfs_delete);
    ufs_thread_continue(&ufsvfsp->vfs_reclaim);

    return (0);

errout:
    /*
     * Lock failed. Reset the old lock in ufsvfs if not hard locked.
     */
    if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
        bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
        ulp->ul_fs_lock = (1 << lfs.lf_lock);
    }

    /*
     * Don't call ufs_thaw() when there's a signal during
     * ufs quiesce operation as it can lead to deadlock
     * with getpage.
     */
    if (signal == 0)
        (void) ufs_thaw(vfsp, ufsvfsp, ulp);

    ULOCKFS_CLR_BUSY(ulp);
    LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
    /* common unwind: reached directly when nothing was changed yet */
    atomic_add_long(&ufs_quiesce_pend, -1);
    mutex_exit(&ulp->ul_lock);
    vfs_unlock(vfsp);

    /*
     * Allow both the delete thread and the reclaim thread to
     * continue.
     */
    ufs_thread_continue(&ufsvfsp->vfs_delete);
    ufs_thread_continue(&ufsvfsp->vfs_reclaim);

    return (error);
}

/*
 * fiolfss
 *  Copy the current file system locking state into *lockfsp.
 *
 *  Returns EINVAL if vp, its vfs or its inode is missing, EIO if the
 *  inode no longer has a ufsvfs (forcible unmount), otherwise 0.
 *
 *  When the file system is hard locked the state is copied without
 *  taking ul_lock and without reporting LOCKFS_MOD.
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
    struct ulockfs  *state;

    if (vp == NULL || vp->v_vfsp == NULL || VTOI(vp) == NULL)
        return (EINVAL);

    /* file system has been forcibly unmounted */
    if (VTOI(vp)->i_ufsvfs == NULL)
        return (EIO);

    state = VTOUL(vp);

    if (!ULOCKFS_IS_HLOCK(state)) {
        mutex_enter(&state->ul_lock);

        *lockfsp = state->ul_lockfs;    /* structure assignment */

        /* tell the caller the fs was modified while locked */
        if (ULOCKFS_IS_MOD(state))
            lockfsp->lf_flags |= LOCKFS_MOD;

        mutex_exit(&state->ul_lock);
        return (0);
    }

    /* hard locked: plain copy, no locking */
    *lockfsp = state->ul_lockfs;    /* structure assignment */
    return (0);
}

/*
 * ufs_check_lockfs
 *  check whether a ufs_vnops conflicts with the file system lock
 *
 *  mask holds the lock bits the operation conflicts with; as long as
 *  ul_fs_lock has one of them set the caller is made to wait on ul_cv.
 *  The caller must hold ul_lock (asserted below).
 *
 *  Returns 0 once there is no conflict; the operation has then been
 *  counted in ul_falloc_cnt (mask contains ULOCKFS_FWLOCK) or in
 *  ul_vnops_cnt (otherwise).  Error returns, none of which touch the
 *  counters:
 *      EAGAIN  thread has T_DONTPEND set and no soft lock is in effect;
 *              T_WOULDBLOCK is set on the thread
 *      EIO     fs is hard locked, or error locked with vfs_dontblock set
 *      EINTR   the interruptible wait was ended by a signal while the
 *              conflict persists, or vfs_dontblock is set after waking
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
    k_sigset_t  smask;
    int     sig, slock;

    ASSERT(MUTEX_HELD(&ulp->ul_lock));

    while (ulp->ul_fs_lock & mask) {
        slock = (int)ULOCKFS_IS_SLOCK(ulp);
        if ((curthread->t_flag & T_DONTPEND) && !slock) {
            curthread->t_flag |= T_WOULDBLOCK;
            return (EAGAIN);
        }
        curthread->t_flag &= ~T_WOULDBLOCK;

        /*
         * In the case of an onerr umount of the fs, threads could
         * have blocked before coming into ufs_check_lockfs and
         * need to check for the special case of ELOCK and
         * vfs_dontblock being set which would indicate that the fs
         * is on its way out and will not return therefore making
         * EIO the appropriate response.
         */
        if (ULOCKFS_IS_HLOCK(ulp) ||
            (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
            return (EIO);

        /*
         * wait for lock status to change
         * (uninterruptibly under a soft lock or when vfs_nointr is set)
         */
        if (slock || ufsvfsp->vfs_nointr) {
            cv_wait(&ulp->ul_cv, &ulp->ul_lock);
        } else {
            sigintr(&smask, 1);
            sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
            sigunintr(&smask);
            if ((!sig && (ulp->ul_fs_lock & mask)) ||
                ufsvfsp->vfs_dontblock)
                return (EINTR);
        }
    }

    /* no conflict any more: account for the operation that may now run */
    if (mask & ULOCKFS_FWLOCK) {
        atomic_add_long(&ulp->ul_falloc_cnt, 1);
        ULOCKFS_SET_FALLOC(ulp);
    } else {
        atomic_add_long(&ulp->ul_vnops_cnt, 1);
    }

    return (0);
}

/*
 * Tell whether the current thread arrived via the handcrafted lockfs
 * protocol path.  Testing T_DONTBLOCK would be the obvious approach, but
 * that flag is also set for recursive VOPs that go to a different
 * filesystem, so ownership of ulockfs->ul_lock is used instead.
 *
 * Returns 1 if curthread owns ulp->ul_lock, 0 otherwise.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
    return (mutex_owner(&ulp->ul_lock) == curthread);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 *
 *  ufsvfsp - per-mount UFS state, NULL if the fs was forcibly unmounted
 *  ulpp    - out: lockfs handle to hand to ufs_lockfs_end(), or NULL
 *            when no end-of-VOP processing is required
 *  mask    - lock bits that conflict with this operation; when it
 *            contains ULOCKFS_FWLOCK the operation is accounted in
 *            ul_falloc_cnt, otherwise in ul_vnops_cnt
 *
 *  Returns:
 *  0       protocol entered (*ulpp != NULL), or nothing to do because
 *          this is a recursive VOP / handcrafted lockfs path
 *          (*ulpp == NULL)
 *  EIO     ufsvfsp is NULL (*ulpp untouched)
 *  ENOMEM  no per-thread record could be allocated (*ulpp == NULL)
 *  other   error from ufs_check_lockfs(); note that on this path *ulpp
 *          is left pointing at the lockfs structure even though no
 *          per-thread record was registered and no counter is held
 *
 *  On success (with *ulpp != NULL) T_DONTBLOCK is set on the thread and
 *  a ulockfs_info_t record for this fs is stored in the thread's TSD
 *  list.
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
    int         error;
    int     rec_vop;
    ushort_t    op_cnt_incremented = 0;
    ulong_t     *ctr;
    struct ulockfs *ulp;
    ulockfs_info_t  *ulockfs_info;
    ulockfs_info_t  *ulockfs_info_free;
    ulockfs_info_t  *ulockfs_info_temp;

    /*
     * file system has been forcibly unmounted
     */
    if (ufsvfsp == NULL)
        return (EIO);

    *ulpp = ulp = &ufsvfsp->vfs_ulockfs;

    /*
     * Do lockfs protocol
     *
     * Fetch this thread's list of lockfs records and let IS_REC_VOP
     * fill in rec_vop and ulockfs_info_free from it.
     */
    ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

    /*
     * Detect recursive VOP call or handcrafted internal lockfs protocol
     * path and bail out in that case.
     */
    if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
        *ulpp = NULL;
        return (0);
    } else {
        /*
         * No reusable record on the thread's list: allocate a new
         * one up front, without sleeping. ulockfs_info_temp is only
         * valid (and only used below) when ulockfs_info_free is NULL.
         */
        if (ulockfs_info_free == NULL) {
            if ((ulockfs_info_temp = (ulockfs_info_t *)
                kmem_zalloc(sizeof (ulockfs_info_t),
                KM_NOSLEEP)) == NULL) {
                *ulpp = NULL;
                return (ENOMEM);
            }
        }
    }

    /*
     * First time VOP call
     *
     * Increment the ctr irrespective of the lockfs state. If the lockfs
     * state is not ULOCKFS_ULOCK, we can decrement it later. However,
     * before incrementing we need to check if there is a pending quiesce
     * request because if we have a continuous stream of ufs_lockfs_begin
     * requests pounding on a few cpu's then the ufs_quiesce thread might
     * never see the value of zero for ctr - a livelock kind of scenario.
     */
    ctr = (mask & ULOCKFS_FWLOCK) ?
        &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
    if (!ULOCKFS_IS_SLOCK(ulp)) {
        atomic_add_long(ctr, 1);
        op_cnt_incremented++;
    }

    /*
     * If the lockfs state (indicated by ul_fs_lock) is not just
     * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
     * where there is a check with an appropriate mask to selectively allow
     * operations permitted for that kind of lockfs state.
     *
     * Even these selective operations should not be allowed to go through
     * if a lockfs request is in progress because that could result in inode
     * modifications during a quiesce and could hence result in inode
     * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
     * so make use of ufs_quiesce_pend to disallow vnode operations when a
     * quiesce is in progress.
     */
    if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
        /*
         * Slow path: give back the optimistic increment (waking
         * waiters if that brought the counter to zero) and let
         * ufs_check_lockfs() re-take it under ul_lock once the
         * operation is allowed.
         */
        if (op_cnt_incremented)
            if (!atomic_add_long_nv(ctr, -1))
                cv_broadcast(&ulp->ul_cv);
        mutex_enter(&ulp->ul_lock);
        error = ufs_check_lockfs(ufsvfsp, ulp, mask);
        mutex_exit(&ulp->ul_lock);
        if (error) {
            /* release the record allocated above, if any */
            if (ulockfs_info_free == NULL)
                kmem_free(ulockfs_info_temp,
                    sizeof (ulockfs_info_t));
            return (error);
        }
    } else {
        /*
         * This is the common case of file system in an unlocked state.
         *
         * If a file system is unlocked, we would expect the ctr to have
         * been incremented by now. But this will not be true when a
         * quiesce is winding up - SLOCK was set when we checked before
         * incrementing the ctr, but by the time we checked for
         * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
         * to take ul_lock and go through the slow path in this uncommon
         * case.
         */
        if (op_cnt_incremented == 0) {
            mutex_enter(&ulp->ul_lock);
            error = ufs_check_lockfs(ufsvfsp, ulp, mask);
            if (error) {
                mutex_exit(&ulp->ul_lock);
                /* release the record allocated above, if any */
                if (ulockfs_info_free == NULL)
                    kmem_free(ulockfs_info_temp,
                        sizeof (ulockfs_info_t));
                return (error);
            }
            /*
             * ufs_check_lockfs() has already set the fallocate
             * flag for an ULOCKFS_FWLOCK mask; setting it again
             * here is harmless.
             */
            if (mask & ULOCKFS_FWLOCK)
                ULOCKFS_SET_FALLOC(ulp);
            mutex_exit(&ulp->ul_lock);
        } else if (mask & ULOCKFS_FWLOCK) {
            /*
             * Fast path for a fallocate style operation: the
             * counter is already held, only the flag still has to
             * be set, which is done under ul_lock.
             */
            mutex_enter(&ulp->ul_lock);
            ULOCKFS_SET_FALLOC(ulp);
            mutex_exit(&ulp->ul_lock);
        }
    }

    /*
     * Record in the thread's TSD list that this fs is now in use by the
     * thread: either recycle the free record found above or push the
     * newly allocated one onto the head of the list.
     */
    if (ulockfs_info_free != NULL) {
        ulockfs_info_free->ulp = ulp;
        if (mask & ULOCKFS_FWLOCK)
            ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
    } else {
        ulockfs_info_temp->ulp = ulp;
        ulockfs_info_temp->next = ulockfs_info;
        if (mask & ULOCKFS_FWLOCK)
            ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
        ASSERT(ufs_lockfs_key != 0);
        (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
    }

    curthread->t_flag |= T_DONTBLOCK;
    return (0);
}

/*
 * Decide whether the thread is leaving its outermost (top level) VOP.
 *
 * head is the thread's list of lockfs records. The answer is 1 when no
 * record on the list refers to a file system any more (every ulp is NULL,
 * or the list is empty) and 0 as soon as one record with a non-NULL ulp
 * is found.
 */
static int
ufs_lockfs_top_vop_return(ulockfs_info_t *head)
{
    ulockfs_info_t *rec = head;

    while (rec != NULL) {
        /* a record that is still in use means a VOP is still active */
        if (rec->ulp != NULL)
            return (0);
        rec = rec->next;
    }

    return (1);
}

/*
 * ufs_lockfs_end - terminate the lockfs locking protocol
 *
 *  ulp - handle returned through ulpp by one of the ufs_lockfs_begin*()
 *        routines; NULL means there is nothing to undo (recursive VOP
 *        or handcrafted lockfs path) and the call is a no-op.
 *
 *  The thread's TSD record for this fs is invalidated, T_DONTBLOCK is
 *  cleared once no record of the thread is in use any more, and the
 *  counter taken at begin time (ul_falloc_cnt for a fallocate style
 *  operation, ul_vnops_cnt otherwise) is dropped. Waiters on ul_cv are
 *  woken when the counter reaches zero.
 */
void
ufs_lockfs_end(struct ulockfs *ulp)
{
    ulockfs_info_t *info;
    ulockfs_info_t *head;

    /*
     * end-of-VOP protocol
     */
    if (ulp == NULL)
        return;

    head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    SEARCH_ULOCKFSP(head, ulp, info);

    /*
     * If we're called from a first level VOP, we have to have a
     * valid ulockfs record in the TSD.
     */
    ASSERT(info != NULL);

    /*
     * Invalidate the ulockfs record.
     */
    info->ulp = NULL;

    if (ufs_lockfs_top_vop_return(head))
        curthread->t_flag &= ~T_DONTBLOCK;

    /* fallocate thread */
    if (ULOCKFS_IS_FALLOC(ulp) && (info->flags & ULOCK_INFO_FALLOCATE)) {
        /* Clear the thread's fallocate state */
        info->flags &= ~ULOCK_INFO_FALLOCATE;
        if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
            mutex_enter(&ulp->ul_lock);
            /*
             * The decrement above happens without ul_lock, and
             * ufs_lockfs_begin()/ufs_lockfs_trybegin() bump
             * ul_falloc_cnt without ul_lock as well before they
             * set the fallocate flag under it. A new fallocate
             * operation may therefore have started between our
             * decrement and the mutex_enter(). Clearing the flag
             * in that case would make that operation's
             * ufs_lockfs_end() take the "normal thread" branch
             * and decrement ul_vnops_cnt instead of
             * ul_falloc_cnt. Re-check the counter under the lock
             * and only clear the flag if no fallocate operation
             * is accounted for.
             */
            if (ulp->ul_falloc_cnt == 0)
                ULOCKFS_CLR_FALLOC(ulp);
            cv_broadcast(&ulp->ul_cv);
            mutex_exit(&ulp->ul_lock);
        }
    } else  { /* normal thread */
        if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
            cv_broadcast(&ulp->ul_cv);
    }
}

/*
 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
 * blocking.
 *
 *  ufsvfsp - per-mount UFS state, NULL if the fs was forcibly unmounted
 *  ulpp    - out: lockfs handle to hand to ufs_lockfs_end(), or NULL
 *            when no end-of-VOP processing is required
 *  mask    - lock bits that conflict with this operation; when it
 *            contains ULOCKFS_FWLOCK the operation is accounted in
 *            ul_falloc_cnt, otherwise in ul_vnops_cnt
 *
 *  Returns:
 *  0       protocol entered (*ulpp != NULL), or nothing to do because
 *          this is a recursive VOP / handcrafted lockfs path
 *          (*ulpp == NULL)
 *  EIO     ufsvfsp is NULL, or the fs is hard locked, or error locked
 *          with vfs_dontblock set
 *  EAGAIN  the current lock state conflicts with mask; unlike
 *          ufs_lockfs_begin() this routine does not wait for the
 *          conflict to clear
 *  ENOMEM  no per-thread record could be allocated (*ulpp == NULL)
 *
 *  On the EIO (lock state) and EAGAIN paths *ulpp is left pointing at
 *  the lockfs structure even though no per-thread record was registered
 *  and no counter is held.
 */
int
ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
    int         error = 0;
    int     rec_vop;
    ushort_t    op_cnt_incremented = 0;
    ulong_t     *ctr;
    struct ulockfs *ulp;
    ulockfs_info_t  *ulockfs_info;
    ulockfs_info_t  *ulockfs_info_free;
    ulockfs_info_t  *ulockfs_info_temp;

    /*
     * file system has been forcibly unmounted
     */
    if (ufsvfsp == NULL)
        return (EIO);

    *ulpp = ulp = &ufsvfsp->vfs_ulockfs;

    /*
     * Do lockfs protocol
     *
     * Fetch this thread's list of lockfs records and let IS_REC_VOP
     * fill in rec_vop and ulockfs_info_free from it.
     */
    ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

    /*
     * Detect recursive VOP call or handcrafted internal lockfs protocol
     * path and bail out in that case.
     */
    if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
        *ulpp = NULL;
        return (0);
    } else {
        /*
         * No reusable record on the thread's list: allocate a new
         * one up front, without sleeping. ulockfs_info_temp is only
         * valid (and only used below) when ulockfs_info_free is NULL.
         */
        if (ulockfs_info_free == NULL) {
            if ((ulockfs_info_temp = (ulockfs_info_t *)
                kmem_zalloc(sizeof (ulockfs_info_t),
                KM_NOSLEEP)) == NULL) {
                *ulpp = NULL;
                return (ENOMEM);
            }
        }
    }

    /*
     * First time VOP call
     *
     * Increment the ctr irrespective of the lockfs state. If the lockfs
     * state is not ULOCKFS_ULOCK, we can decrement it later. However,
     * before incrementing we need to check if there is a pending quiesce
     * request because if we have a continuous stream of ufs_lockfs_begin
     * requests pounding on a few cpu's then the ufs_quiesce thread might
     * never see the value of zero for ctr - a livelock kind of scenario.
     */
    ctr = (mask & ULOCKFS_FWLOCK) ?
        &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
    if (!ULOCKFS_IS_SLOCK(ulp)) {
        atomic_add_long(ctr, 1);
        op_cnt_incremented++;
    }

    if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
        /*
         * Non-blocking version of ufs_check_lockfs() code.
         *
         * If the file system is not hard locked or error locked
         * and if ulp->ul_fs_lock allows this operation, increment
         * the appropriate counter and proceed (e.g., in case the
         * file system is delete locked, a mmap can still go through).
         *
         * The optimistic increment is given back first (waking
         * waiters if that brought the counter to zero); the counter
         * is taken again under ul_lock once the checks have passed.
         */
        if (op_cnt_incremented)
            if (!atomic_add_long_nv(ctr, -1))
                cv_broadcast(&ulp->ul_cv);
        mutex_enter(&ulp->ul_lock);
        if (ULOCKFS_IS_HLOCK(ulp) ||
            (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
            error = EIO;
        else if (ulp->ul_fs_lock & mask)
            error = EAGAIN;

        if (error) {
            mutex_exit(&ulp->ul_lock);
            /* release the record allocated above, if any */
            if (ulockfs_info_free == NULL)
                kmem_free(ulockfs_info_temp,
                    sizeof (ulockfs_info_t));
            return (error);
        }
        atomic_add_long(ctr, 1);
        if (mask & ULOCKFS_FWLOCK)
            ULOCKFS_SET_FALLOC(ulp);
        mutex_exit(&ulp->ul_lock);
    } else {
        /*
         * This is the common case of file system in an unlocked state.
         *
         * If a file system is unlocked, we would expect the ctr to have
         * been incremented by now. But this will not be true when a
         * quiesce is winding up - SLOCK was set when we checked before
         * incrementing the ctr, but by the time we checked for
         * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
         * ul_lock and go through the non-blocking version of
         * ufs_check_lockfs() code.
         */
        if (op_cnt_incremented == 0) {
            mutex_enter(&ulp->ul_lock);
            if (ULOCKFS_IS_HLOCK(ulp) ||
                (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
                error = EIO;
            else if (ulp->ul_fs_lock & mask)
                error = EAGAIN;

            if (error) {
                mutex_exit(&ulp->ul_lock);
                /* release the record allocated above, if any */
                if (ulockfs_info_free == NULL)
                    kmem_free(ulockfs_info_temp,
                        sizeof (ulockfs_info_t));
                return (error);
            }
            atomic_add_long(ctr, 1);
            if (mask & ULOCKFS_FWLOCK)
                ULOCKFS_SET_FALLOC(ulp);
            mutex_exit(&ulp->ul_lock);
        } else if (mask & ULOCKFS_FWLOCK) {
            /*
             * Fast path for a fallocate style operation: the
             * counter is already held, only the flag still has to
             * be set, which is done under ul_lock.
             */
            mutex_enter(&ulp->ul_lock);
            ULOCKFS_SET_FALLOC(ulp);
            mutex_exit(&ulp->ul_lock);
        }
    }

    /*
     * Record in the thread's TSD list that this fs is now in use by the
     * thread: either recycle the free record found above or push the
     * newly allocated one onto the head of the list.
     */
    if (ulockfs_info_free != NULL) {
        ulockfs_info_free->ulp = ulp;
        if (mask & ULOCKFS_FWLOCK)
            ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
    } else {
        ulockfs_info_temp->ulp = ulp;
        ulockfs_info_temp->next = ulockfs_info;
        if (mask & ULOCKFS_FWLOCK)
            ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
        ASSERT(ufs_lockfs_key != 0);
        (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
    }

    curthread->t_flag |= T_DONTBLOCK;
    return (0);
}

/*
 * ufs_lockfs_begin_getpage - flavour of ufs_lockfs_begin() used by
 * ufs_getpage().
 *
 *  ufsvfsp     - per-mount UFS state, NULL if the fs was forcibly unmounted
 *  ulpp        - out: lockfs handle for ufs_lockfs_end(), or NULL when
 *                no end-of-VOP processing is required
 *  seg         - segment the fault is for; selects the lock mask
 *  read_access - non-zero if the fault is a read type access
 *  protp       - optional in/out protections; PROT_WRITE may be removed
 *
 *  The lock mask is not passed in but derived here, and only when the
 *  file system is not in the plain unlocked state (or a quiesce is
 *  pending).
 *
 *  Returns 0 on success (with *ulpp == NULL for a recursive VOP or the
 *  handcrafted lockfs path), EIO if ufsvfsp is NULL, ENOMEM if no
 *  per-thread record could be allocated, or the error reported by
 *  ufs_check_lockfs().
 */
int
ufs_lockfs_begin_getpage(
    struct ufsvfs   *ufsvfsp,
    struct ulockfs  **ulpp,
    struct seg  *seg,
    int     read_access,
    uint_t      *protp)
{
    struct ulockfs      *ulp;
    ulockfs_info_t      *tsd_head;
    ulockfs_info_t      *idle_rec;
    ulockfs_info_t      *new_rec;
    ulong_t         mask;
    int         nested;
    int         err;

    /* the file system has been forcibly unmounted */
    if (ufsvfsp == NULL)
        return (EIO);

    ulp = &ufsvfsp->vfs_ulockfs;
    *ulpp = ulp;

    /* look at what this thread already has on record for lockfs */
    tsd_head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
    IS_REC_VOP(nested, tsd_head, ulp, idle_rec);

    /*
     * A recursive VOP, or a thread inside the handcrafted lockfs
     * protocol path, has nothing to do here.
     */
    if (nested || ufs_lockfs_is_under_rawlockfs(ulp)) {
        *ulpp = NULL;
        return (0);
    }

    /*
     * Without a recyclable record a new one is needed; get it now,
     * without sleeping, so that failure is cheap to back out of.
     */
    if (idle_rec == NULL) {
        new_rec = (ulockfs_info_t *)kmem_zalloc(
            sizeof (ulockfs_info_t), KM_NOSLEEP);
        if (new_rec == NULL) {
            *ulpp = NULL;
            return (ENOMEM);
        }
    }

    /*
     * First level VOP: count it optimistically, then look at the lock
     * state.
     */
    atomic_add_long(&ulp->ul_vnops_cnt, 1);
    if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
        /*
         * Not the plain unlocked case: undo the optimistic count and
         * go through ufs_check_lockfs(), which counts the operation
         * itself once it is allowed to proceed.
         */
        if (atomic_add_long_nv(&ulp->ul_vnops_cnt, -1) == 0)
            cv_broadcast(&ulp->ul_cv);

        mutex_enter(&ulp->ul_lock);
        if (seg->s_ops != &segvn_ops ||
            ((struct segvn_data *)seg->s_data)->type == MAP_SHARED) {
            if (protp != NULL && read_access) {
                /*
                 * Hand out a read-only mapping. A later write
                 * faults again, and that fault is then
                 * suspended if the fs is write locked.
                 */
                *protp &= ~PROT_WRITE;
                mask = (ulong_t)ULOCKFS_GETREAD_MASK;
            } else {
                mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
            }
        } else {
            /* segvn segment that is not MAP_SHARED */
            mask = (ulong_t)ULOCKFS_GETREAD_MASK;
        }

        /* sleeps if this fs is locked against this VOP */
        err = ufs_check_lockfs(ufsvfsp, ulp, mask);
        mutex_exit(&ulp->ul_lock);
        if (err != 0) {
            if (idle_rec == NULL)
                kmem_free(new_rec, sizeof (ulockfs_info_t));
            return (err);
        }
    }

    /*
     * Note on the thread that it is now active in this fs: push the
     * fresh record onto the TSD list, or recycle the idle one.
     */
    if (idle_rec == NULL) {
        new_rec->ulp = ulp;
        new_rec->next = tsd_head;
        ASSERT(ufs_lockfs_key != 0);
        (void) tsd_set(ufs_lockfs_key, (void *)new_rec);
    } else {
        idle_rec->ulp = ulp;
    }

    curthread->t_flag |= T_DONTBLOCK;
    return (0);
}

void
ufs_lockfs_tsd_destructor(void *head)
{
    ulockfs_info_t *curr = (ulockfs_info_t *)head;
    ulockfs_info_t *temp;

    for (; curr != NULL; ) {
        /*
         * The TSD destructor is being called when the thread exits
         * (via thread_exit()). At that time it must have cleaned up
         * all VOPs via ufs_lockfs_end() and there must not be a
         * valid ulockfs record exist while a thread is exiting.
         */
        temp = curr;
        curr = curr->next;
        ASSERT(temp->ulp == NULL);
        kmem_free(temp, sizeof (ulockfs_info_t));
    }
}