/* ufs_bmap.c, revision 7c478bd95313f5f23a4c958a745db2134aa03244 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/disp.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_bio.h>
#include <vm/seg.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kmem.h>
/*
* This structure is used to track blocks as we allocate them, so that
* we can free them if we encounter an error during allocation. We
* keep track of five pieces of information for each allocated block:
* - The number of the newly allocated block
* - The size of the block (lets us deal with fragments if we want)
* - The number of the block containing a pointer to it; or whether
* the pointer is in the inode
* - The offset within the block (or inode) containing a pointer to it.
* - A flag indicating the usage of the block. (Logging needs to know
* this to avoid overwriting a data block if it was previously used
* for metadata.)
*/
enum ufs_owner_type {
ufs_no_owner, /* Owner has not yet been updated */
ufs_inode_direct, /* Listed in inode's direct block table */
ufs_inode_indirect, /* Listed in inode's indirect block table */
ufs_indirect_block /* Listed in an indirect block */
};
struct ufs_allocated_block {
daddr_t this_block; /* Number of this block */
off_t block_size; /* Size of this block, in bytes */
enum ufs_owner_type owner; /* Who points to this block? */
daddr_t owner_block; /* Number of the owning block */
uint_t owner_offset; /* Offset within that block or inode */
int usage_flags; /* Usage flags, as expected by free() */
};
static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
int maxtrans);
static void ufs_undo_allocation(inode_t *ip, int block_count,
struct ufs_allocated_block table[], int inode_sector_adjust);
/*
* Find the extent and the matching block number.
*
* bsize > PAGESIZE
* boff indicates that we want a page in the middle
* min expression is supposed to make sure no extra page[s] after EOF
* PAGESIZE >= bsize
* we assume that a page is a multiple of bsize, i.e.,
* boff always == 0
*
* We always return a length that is suitable for a disk transfer.
*/
#define DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
register daddr32_t *dp = (tblp); \
register int _chkfrag = chkfrag; /* for lint. sigh */ \
\
if (*dp == 0) { \
*(bnp) = UFS_HOLE; \
} else { \
register int len; \
\
len = findextent(fs, dp, (int)(n), lenp, maxtrans) << \
(fs)->fs_bshift; \
if (_chkfrag) { \
register u_offset_t tmp; \
\
tmp = fragroundup((fs), size) - \
(((u_offset_t)lbn) << fs->fs_bshift); \
len = (int)MIN(tmp, len); \
} \
len -= (boff); \
if (len <= 0) { \
*(bnp) = UFS_HOLE; \
} else { \
*(bnp) = fsbtodb(fs, *dp) + btodb(boff); \
*(lenp) = len; \
} \
} \
}
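/*
 * Illustrative expansion of DOEXTENT (a sketch, assuming an 8K-block
 * file system, fs_bshift == 13, with boff == 0 and chkfrag == 0):
 *
 *	If findextent() reports 3 contiguous blocks starting at *dp, then
 *		len   = 3 << 13 = 24576 bytes of contiguous data,
 *		*bnp  = fsbtodb(fs, *dp), the disk address of the first block,
 *		*lenp = 24576.
 *
 *	When chkfrag is set (the direct-block case), len is first clamped
 *	to fragroundup(fs, size) - (lbn << fs_bshift) so that a trailing
 *	fragment cannot cause a transfer past the end of the file.
 */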
/*
 * The maximum supported file size is actually somewhat less than 1
* terabyte. This is because the total number of blocks used for the
* file and its metadata must fit into the ic_blocks field of the
* inode, which is a signed 32-bit quantity. The metadata allocated
* for a file (that is, the single, double, and triple indirect blocks
* used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
* within 1 GB of a terabyte. VERYLARGEFILESIZE below is the number of
* 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
* in a gigabyte (2^21). We only check for overflow in the ic_blocks
* field if the number of blocks currently allocated to the file is
* greater than VERYLARGEFILESIZE.
*
 * Note that file "size" is not the same as file "length". A
 * file's "size" is the number of blocks allocated to it. A file's
 * "length" is the maximum offset in the file. A UFS file can have a
* length of a terabyte, but the size is limited to somewhat less than
* a terabyte, as described above.
*/
#define VERYLARGEFILESIZE 0x7FE00000
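/*
 * As a quick check of the constant above: 2^31 - 2^21 =
 * 0x80000000 - 0x00200000 = 0x7FE00000.
 */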
/*
 * bmap_read and bmap_write define the structure of file system storage by mapping
* a logical offset in a file to a physical block number on the device.
* It should be called with a locked inode when allocation is to be
 * done (bmap_write). Note this strangeness: bmap_write is always called from
* getpage(), not putpage(), since getpage() is where all the allocation
* is done.
*
 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
*
* NOTICE: the block number returned is the disk block number, not the
* file system block number. All the worries about block offsets and
* page/block sizes are hidden inside of bmap. Well, not quite,
* unfortunately. It's impossible to find one place to hide all this
* mess. There are 3 cases:
*
* PAGESIZE < bsize
* In this case, the {get,put}page routines will attempt to align to
 * a file system block boundary (XXX - maybe this is a mistake?). Since
* the kluster routines may be out of memory, we don't always get all
* the pages we wanted. If we called bmap first, to find out how much
* to kluster, we handed in the block aligned offset. If we didn't get
* all the pages, we have to chop off the amount we didn't get from the
* amount handed back by bmap.
*
* PAGESIZE == bsize
* Life is quite pleasant here, no extra work needed, mainly because we
* (probably?) won't kluster backwards, just forwards.
*
* PAGESIZE > bsize
* This one has a different set of problems, specifically, we may have to
* do N reads to fill one page. Let us hope that Sun will stay with small
* pages.
*
* Returns 0 on success, or a non-zero errno if an error occurs.
*
* TODO
* LMXXX - add a bmap cache. This could be a couple of extents in the
* inode. Two is nice for PAGESIZE > bsize.
*/
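/*
 * Concrete sizes for the three cases above (a sketch; the actual values
 * depend on the platform page size and the file system block size):
 *
 *	PAGESIZE 4K, bsize 8K	->  PAGESIZE < bsize   (2 pages per block)
 *	PAGESIZE 8K, bsize 8K	->  PAGESIZE == bsize
 *	PAGESIZE 8K, bsize 4K	->  PAGESIZE > bsize   (2 reads per page)
 */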
int
bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
{
daddr_t lbn;
ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
struct buf *bp;
int i, j, boff;
int shft; /* we maintain sh = 1 << shft */
daddr_t ob, nb, tbn;
daddr32_t *bap;
int nindirshift, nindiroffset;
ASSERT(RW_LOCK_HELD(&ip->i_contents));
lbn = (daddr_t)lblkno(fs, off);
boff = (int)blkoff(fs, off);
if (lbn < 0)
return (EFBIG);
/*
* The first NDADDR blocks are direct blocks.
*/
if (lbn < NDADDR) {
DOEXTENT(fs, lbn, boff, bnp, lenp,
ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
ufsvfsp->vfs_iotransz);
return (0);
}
nindirshift = ufsvfsp->vfs_nindirshift;
nindiroffset = ufsvfsp->vfs_nindiroffset;
/*
* Determine how many levels of indirection.
*/
shft = 0; /* sh = 1 */
tbn = lbn - NDADDR;
for (j = NIADDR; j > 0; j--) {
longlong_t sh;
shft += nindirshift; /* sh *= nindir */
sh = 1LL << shft;
if (tbn < sh)
break;
tbn -= sh;
}
if (j == 0)
return (EFBIG);
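	/*
	 * Worked example for the loop above (a sketch, assuming an
	 * 8K-block file system, i.e. nindirshift == 11, nindir == 2048):
	 *
	 *	j == NIADDR	single indirect, maps tbn 0 .. 2047
	 *	j == NIADDR-1	double indirect, maps the next 2048^2 blocks
	 *	j == NIADDR-2	triple indirect, maps the next 2048^3 blocks
	 *
	 * On exit tbn has been reduced to an index relative to the start
	 * of the chosen level, and 1 << shft is the number of data blocks
	 * that level can map.
	 */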
/*
* Fetch the first indirect block.
*/
nb = ip->i_ib[NIADDR - j];
if (nb == 0) {
*bnp = UFS_HOLE;
return (0);
}
/*
* Fetch through the indirect blocks.
*/
for (; j <= NIADDR; j++) {
ob = nb;
bp = UFS_BREAD(ufsvfsp,
ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
if (bp->b_flags & B_ERROR) {
brelse(bp);
return (EIO);
}
bap = bp->b_un.b_daddr;
ASSERT(!ufs_indir_badblock(ip, bap));
shft -= nindirshift; /* sh / nindir */
i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
nb = bap[i];
if (nb == 0) {
*bnp = UFS_HOLE;
brelse(bp);
return (0);
}
if (j != NIADDR)
brelse(bp);
}
DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
0, ufsvfsp->vfs_iotransz);
brelse(bp);
return (0);
}
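/*
 * Illustrative (hypothetical) caller sketch for bmap_read(), assuming
 * the usual getpage-style use with ip->i_contents held:
 *
 *	daddr_t bn;
 *	int contig = 0;		(or the number of bytes wanted)
 *	err = bmap_read(ip, off, &bn, &contig);
 *	if (err == 0 && bn == UFS_HOLE)
 *		... the offset lies in a hole; supply zeroed pages ...
 *	else if (err == 0)
 *		... read "contig" bytes starting at disk block "bn" ...
 */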
/*
 * See bmap_read for general notes.
*
* The block must be at least size bytes and will be extended or
* allocated as needed. If alloc_only is set, bmap will not create
* any in-core pages that correspond to the new disk allocation.
* Otherwise, the in-core pages will be created and initialized as
* needed.
*
* Returns 0 on success, or a non-zero errno if an error occurs.
*/
int
bmap_write(
struct inode *ip,
u_offset_t off,
int size,
int alloc_only,
struct cred *cr)
{
struct fs *fs;
struct buf *bp;
int i;
struct buf *nbp;
int j;
int shft; /* we maintain sh = 1 << shft */
daddr_t ob, nb, pref, lbn, llbn, tbn;
daddr32_t *bap;
struct vnode *vp = ITOV(ip);
long bsize = VBSIZE(vp);
long osize, nsize;
int issync, metaflag, isdirquota;
int err;
dev_t dev;
struct fbuf *fbp;
int nindirshift;
int nindiroffset;
struct ufsvfs *ufsvfsp;
int added_sectors; /* sectors added to this inode */
int alloced_blocks; /* fs blocks newly allocated */
struct ufs_allocated_block undo_table[NIADDR+1];
int verylargefile = 0;
ASSERT(RW_WRITE_HELD(&ip->i_contents));
ufsvfsp = ip->i_ufsvfs;
fs = ufsvfsp->vfs_bufp->b_un.b_fs;
lbn = (daddr_t)lblkno(fs, off);
if (lbn < 0)
return (EFBIG);
if (ip->i_blocks >= VERYLARGEFILESIZE)
verylargefile = 1;
llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
metaflag = isdirquota = 0;
if (((ip->i_mode & IFMT) == IFDIR) ||
((ip->i_mode & IFMT) == IFATTRDIR))
isdirquota = metaflag = I_DIR;
else if ((ip->i_mode & IFMT) == IFSHAD)
metaflag = I_SHAD;
else if (ip->i_ufsvfs->vfs_qinod == ip)
isdirquota = metaflag = I_QUOTA;
issync = ((ip->i_flag & ISYNC) != 0);
if (isdirquota || issync) {
alloc_only = 0; /* make sure */
}
/*
* If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
* this fragment has to be extended to be a full block.
*/
if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
osize = blksize(fs, ip, llbn);
if (osize < bsize && osize > 0) {
/*
* Check to see if doing this will make the file too
* big. Only check if we are dealing with a very
* large file.
*/
if (verylargefile == 1) {
if (((unsigned)ip->i_blocks +
btodb(bsize - osize)) > INT_MAX) {
return (EFBIG);
}
}
/*
* Make sure we have all needed pages setup correctly.
*
* We pass S_OTHER to fbread here because we want
* an exclusive lock on the page in question
* (see ufs_getpage). I/O to the old block location
* may still be in progress and we are about to free
* the old block. We don't want anyone else to get
* a hold of the old block once we free it until
* the I/O is complete.
*/
err = fbread(ITOV(ip),
((offset_t)llbn << fs->fs_bshift),
(uint_t)bsize, S_OTHER, &fbp);
if (err)
return (err);
pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
&nb, cr);
if (err) {
if (fbp)
fbrelse(fbp, S_OTHER);
return (err);
}
ASSERT(!ufs_badblock(ip, nb));
/*
* Update the inode before releasing the
* lock on the page. If we released the page
			 * lock first, the data could be written to its
* old address and then destroyed.
*/
TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
ip->i_db[llbn] = nb;
UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
ip);
ip->i_blocks += btodb(bsize - osize);
ASSERT((unsigned)ip->i_blocks <= INT_MAX);
TRANS_INODE(ufsvfsp, ip);
ip->i_flag |= IUPD | ICHG | IATTCHG;
/* Caller is responsible for updating i_seq */
/*
* Don't check metaflag here, directories won't do this
*
*/
if (issync) {
(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
} else {
ASSERT(fbp);
fbrelse(fbp, S_WRITE);
}
if (nb != ob) {
(void) free(ip, ob, (off_t)osize, metaflag);
}
}
}
/*
* The first NDADDR blocks are direct blocks.
*/
if (lbn < NDADDR) {
nb = ip->i_db[lbn];
if (nb == 0 ||
ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
if (nb != 0) {
/* consider need to reallocate a frag */
osize = fragroundup(fs, blkoff(fs, ip->i_size));
nsize = fragroundup(fs, size);
if (nsize <= osize)
goto gotit;
/*
* Check to see if doing this will make the
* file too big. Only check if we are dealing
* with a very large file.
*/
if (verylargefile == 1) {
if (((unsigned)ip->i_blocks +
btodb(nsize - osize)) > INT_MAX) {
return (EFBIG);
}
}
/*
* need to allocate a block or frag
*/
ob = nb;
pref = blkpref(ip, lbn, (int)lbn,
&ip->i_db[0]);
err = realloccg(ip, ob, pref, (int)osize,
(int)nsize, &nb, cr);
if (err)
return (err);
ASSERT(!ufs_badblock(ip, nb));
} else {
/*
* need to allocate a block or frag
*/
osize = 0;
if (ip->i_size <
((u_offset_t)(lbn + 1)) << fs->fs_bshift)
nsize = fragroundup(fs, size);
else
nsize = bsize;
/*
* Check to see if doing this will make the
* file too big. Only check if we are dealing
* with a very large file.
*/
if (verylargefile == 1) {
if (((unsigned)ip->i_blocks +
btodb(nsize - osize)) > INT_MAX) {
return (EFBIG);
}
}
pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
err = alloc(ip, pref, (int)nsize, &nb, cr);
if (err)
return (err);
ASSERT(!ufs_badblock(ip, nb));
ob = nb;
}
/*
* Read old/create new zero pages
*/
fbp = NULL;
if (osize == 0) {
/*
* mmap S_WRITE faults always enter here
*/
if (!alloc_only || P2ROUNDUP_TYPED(size,
PAGESIZE, u_offset_t) < nsize) {
/* fbzero doesn't cause a pagefault */
fbzero(ITOV(ip),
((offset_t)lbn << fs->fs_bshift),
(uint_t)nsize, &fbp);
}
} else {
err = fbread(vp,
((offset_t)lbn << fs->fs_bshift),
(uint_t)nsize, S_OTHER, &fbp);
if (err) {
if (nb != ob) {
(void) free(ip, nb,
(off_t)nsize, metaflag);
} else {
(void) free(ip,
ob + numfrags(fs, osize),
(off_t)(nsize - osize),
metaflag);
}
ASSERT(nsize >= osize);
(void) chkdq(ip,
-(long)btodb(nsize - osize),
0, cr, (char **)NULL,
(size_t *)NULL);
return (err);
}
}
TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
ip->i_db[lbn] = nb;
ip->i_blocks += btodb(nsize - osize);
ASSERT((unsigned)ip->i_blocks <= INT_MAX);
TRANS_INODE(ufsvfsp, ip);
ip->i_flag |= IUPD | ICHG | IATTCHG;
/* Caller is responsible for updating i_seq */
/*
* Write directory and shadow blocks synchronously so
* that they never appear with garbage in them on the
* disk.
*
*/
if (isdirquota && (ip->i_size ||
TRANS_ISTRANS(ufsvfsp))) {
/*
				 * XXX may not be necessary with harpy trans
* bug id 1130055
*/
(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
} else if (fbp) {
fbrelse(fbp, S_WRITE);
}
if (nb != ob)
(void) free(ip, ob, (off_t)osize, metaflag);
}
gotit:
return (0);
}
added_sectors = alloced_blocks = 0; /* No blocks alloced yet */
/*
* Determine how many levels of indirection.
*/
nindirshift = ip->i_ufsvfs->vfs_nindirshift;
nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
pref = 0;
shft = 0; /* sh = 1 */
tbn = lbn - NDADDR;
for (j = NIADDR; j > 0; j--) {
longlong_t sh;
shft += nindirshift; /* sh *= nindir */
sh = 1LL << shft;
if (tbn < sh)
break;
tbn -= sh;
}
if (j == 0)
return (EFBIG);
/*
* Fetch the first indirect block.
*/
dev = ip->i_dev;
nb = ip->i_ib[NIADDR - j];
if (nb == 0) {
/*
* Check to see if doing this will make the
* file too big. Only check if we are dealing
* with a very large file.
*/
if (verylargefile == 1) {
if (((unsigned)ip->i_blocks + btodb(bsize))
> INT_MAX) {
return (EFBIG);
}
}
/*
* Need to allocate an indirect block.
*/
pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
err = alloc(ip, pref, (int)bsize, &nb, cr);
if (err)
return (err);
TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
ASSERT(!ufs_badblock(ip, nb));
/*
* Keep track of this allocation so we can undo it if we
* get an error later.
*/
ASSERT(alloced_blocks <= NIADDR);
undo_table[alloced_blocks].this_block = nb;
undo_table[alloced_blocks].block_size = bsize;
undo_table[alloced_blocks].owner = ufs_no_owner;
undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
alloced_blocks++;
/*
* Write zero block synchronously so that
* indirect blocks never point at garbage.
*/
bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
clrbuf(bp);
/* XXX Maybe special-case this? */
TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
UFS_BWRITE2(ufsvfsp, bp);
if (bp->b_flags & B_ERROR) {
err = geterror(bp);
brelse(bp);
ufs_undo_allocation(ip, alloced_blocks,
undo_table, added_sectors);
return (err);
}
brelse(bp);
ip->i_ib[NIADDR - j] = nb;
added_sectors += btodb(bsize);
ip->i_blocks += btodb(bsize);
ASSERT((unsigned)ip->i_blocks <= INT_MAX);
TRANS_INODE(ufsvfsp, ip);
ip->i_flag |= IUPD | ICHG | IATTCHG;
/* Caller is responsible for updating i_seq */
/*
* Update the 'undo table' now that we've linked this block
* to an inode.
*/
undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
/*
* In the ISYNC case, wrip will notice that the block
* count on the inode has changed and will be sure to
* ufs_iupdat the inode at the end of wrip.
*/
}
/*
* Fetch through the indirect blocks.
*/
for (; j <= NIADDR; j++) {
ob = nb;
bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
if (bp->b_flags & B_ERROR) {
err = geterror(bp);
brelse(bp);
/*
* Return any partial allocations.
*
* It is possible that we have not yet made any
* allocations at this point (if this is the first
* pass through the loop and we didn't have to
* allocate the first indirect block, above).
* In this case, alloced_blocks and added_sectors will
* be zero, and ufs_undo_allocation will do nothing.
*/
ufs_undo_allocation(ip, alloced_blocks,
undo_table, added_sectors);
return (err);
}
bap = bp->b_un.b_daddr;
shft -= nindirshift; /* sh /= nindir */
i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
nb = bap[i];
if (nb == 0) {
/*
* Check to see if doing this will make the
* file too big. Only check if we are dealing
* with a very large file.
*/
if (verylargefile == 1) {
if (((unsigned)ip->i_blocks + btodb(bsize))
> INT_MAX) {
brelse(bp);
ufs_undo_allocation(ip, alloced_blocks,
undo_table, added_sectors);
return (EFBIG);
}
}
if (pref == 0) {
if (j < NIADDR) {
/* Indirect block */
pref = blkpref(ip, lbn, 0,
(daddr32_t *)0);
} else {
/* Data block */
pref = blkpref(ip, lbn, i, &bap[0]);
}
}
/*
* release "bp" buf to avoid deadlock (re-bread later)
*/
brelse(bp);
err = alloc(ip, pref, (int)bsize, &nb, cr);
if (err) {
/*
* Return any partial allocations.
*/
ufs_undo_allocation(ip, alloced_blocks,
undo_table, added_sectors);
return (err);
}
ASSERT(!ufs_badblock(ip, nb));
ASSERT(alloced_blocks <= NIADDR);
undo_table[alloced_blocks].this_block = nb;
undo_table[alloced_blocks].block_size = bsize;
undo_table[alloced_blocks].owner = ufs_no_owner;
undo_table[alloced_blocks].usage_flags = metaflag |
((j < NIADDR) ? I_IBLK : 0);
alloced_blocks++;
if (j < NIADDR) {
TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
/*
* Write synchronously so indirect
* blocks never point at garbage.
*/
nbp = UFS_GETBLK(
ufsvfsp, dev, fsbtodb(fs, nb), bsize);
clrbuf(nbp);
/* XXX Maybe special-case this? */
TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
UFS_BWRITE2(ufsvfsp, nbp);
if (nbp->b_flags & B_ERROR) {
err = geterror(nbp);
brelse(nbp);
/*
* Return any partial
* allocations.
*/
ufs_undo_allocation(ip,
alloced_blocks,
undo_table, added_sectors);
return (err);
}
brelse(nbp);
} else if (!alloc_only || P2ROUNDUP_TYPED(size,
PAGESIZE, u_offset_t) < bsize) {
TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
fbzero(ITOV(ip),
((offset_t)lbn << fs->fs_bshift),
(uint_t)bsize, &fbp);
/*
				 * Cases in which we need to do a synchronous
* write of the zeroed data pages:
*
* 1) If we are writing a directory then we
* want to write synchronously so blocks in
* directories never contain garbage.
*
* 2) If we are filling in a hole and the
* indirect block is going to be synchronously
* written back below we need to make sure
* that the zeroes are written here before
* the indirect block is updated so that if
* we crash before the real data is pushed
				 * we will not end up with random data in
* the middle of the file.
*
* 3) If the size of the request rounded up
* to the system page size is smaller than
* the file system block size, we want to
* write out all the pages now so that
* they are not aborted before they actually
* make it to ufs_putpage since the length
* of the inode will not include the pages.
*/
if (isdirquota || (issync &&
lbn < llbn))
(void) ufs_fbiwrite(fbp, ip, nb,
fs->fs_fsize);
else
fbrelse(fbp, S_WRITE);
}
/*
* re-acquire "bp" buf
*/
bp = UFS_BREAD(ufsvfsp,
ip->i_dev, fsbtodb(fs, ob), bsize);
if (bp->b_flags & B_ERROR) {
err = geterror(bp);
brelse(bp);
/*
* Return any partial allocations.
*/
ufs_undo_allocation(ip,
alloced_blocks,
undo_table, added_sectors);
return (err);
}
bap = bp->b_un.b_daddr;
bap[i] = nb;
TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
added_sectors += btodb(bsize);
ip->i_blocks += btodb(bsize);
ASSERT((unsigned)ip->i_blocks <= INT_MAX);
TRANS_INODE(ufsvfsp, ip);
ip->i_flag |= IUPD | ICHG | IATTCHG;
/* Caller is responsible for updating i_seq */
undo_table[alloced_blocks-1].owner =
ufs_indirect_block;
undo_table[alloced_blocks-1].owner_block = ob;
undo_table[alloced_blocks-1].owner_offset = i;
if (issync) {
UFS_BWRITE2(ufsvfsp, bp);
if (bp->b_flags & B_ERROR) {
err = geterror(bp);
brelse(bp);
/*
* Return any partial
* allocations.
*/
ufs_undo_allocation(ip,
alloced_blocks,
undo_table, added_sectors);
return (err);
}
brelse(bp);
} else {
bdrwrite(bp);
}
} else {
brelse(bp);
}
}
return (0);
}
/*
* Return 1 if inode has unmapped blocks (UFS holes).
*/
int
bmap_has_holes(struct inode *ip)
{
struct fs *fs = ip->i_fs;
uint_t dblks; /* # of data blocks */
uint_t mblks; /* # of data + metadata blocks */
int nindirshift;
int nindiroffset;
uint_t cnt;
int n, j, shft;
uint_t nindirblks;
int fsbshift = fs->fs_bshift;
int fsboffset = (1 << fsbshift) - 1;
dblks = (ip->i_size + fsboffset) >> fsbshift;
mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
/*
* File has only direct blocks.
*/
if (dblks <= NDADDR)
return (mblks < dblks);
nindirshift = ip->i_ufsvfs->vfs_nindirshift;
nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
nindirblks = nindiroffset + 1;
dblks -= NDADDR;
shft = 0;
/*
* Determine how many levels of indirection.
*/
for (j = NIADDR; j > 0; j--) {
longlong_t sh;
shft += nindirshift; /* sh *= nindir */
sh = 1LL << shft;
if (dblks <= sh)
break;
dblks -= sh;
}
/* LINTED: warning: logical expression always true: op "||" */
ASSERT(NIADDR <= 3);
ASSERT(j <= NIADDR);
if (j == NIADDR) /* single level indirection */
cnt = NDADDR + 1 + dblks;
else if (j == NIADDR-1) /* double indirection */
cnt = NDADDR + 1 + nindirblks +
1 + (dblks + nindiroffset)/nindirblks + dblks;
else if (j == NIADDR-2) { /* triple indirection */
n = (dblks + nindiroffset)/nindirblks;
cnt = NDADDR + 1 + nindirblks +
1 + nindirblks + nindirblks*nindirblks +
1 + (n + nindiroffset)/nindirblks + n + dblks;
}
return (mblks < cnt);
}
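/*
 * Worked example for bmap_has_holes() (a sketch, assuming NDADDR == 12
 * and an 8K-block file system, so nindirblks == 2048):
 *
 *	For a 20-block file, dblks becomes 20 - 12 = 8 after the direct
 *	blocks are accounted for, and the loop selects single indirection,
 *	so cnt = NDADDR + 1 + dblks = 12 + 1 + 8 = 21: the number of file
 *	system blocks a dense (hole-free) file of that length would use,
 *	including the one indirect block.  If the blocks actually charged
 *	to the inode (mblks) come to fewer than 21, the file has holes.
 */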
/*
* find some contig blocks starting at *sbp and going for min(n, max_contig)
* return the number of blocks (not frags) found.
* The array passed in must be at least [0..n-1].
*/
static int
findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
{
register daddr_t bn, nextbn;
register daddr32_t *bp;
register int diff;
int maxtransblk;
if (n <= 0)
return (0);
bn = *sbp;
if (bn == 0)
return (0);
diff = fs->fs_frag;
if (*lenp) {
n = MIN(n, lblkno(fs, *lenp));
} else {
/*
* If the user has set the value for maxcontig lower than
* the drive transfer size, then assume they want this
* to be the maximum value for the size of the data transfer.
*/
maxtransblk = maxtransfer >> DEV_BSHIFT;
if (fs->fs_maxcontig < maxtransblk) {
n = MIN(n, fs->fs_maxcontig);
} else {
n = MIN(n, maxtransblk);
}
}
bp = sbp;
while (--n > 0) {
nextbn = *(bp + 1);
if (nextbn == 0 || bn + diff != nextbn)
break;
bn = nextbn;
bp++;
}
return ((int)(bp - sbp) + 1);
}
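/*
 * Example of the contiguity test in findextent() (a sketch, assuming
 * fs_frag == 8, i.e. eight fragments per block):
 *
 *	Block pointers are kept as fragment numbers, so two consecutive
 *	file system blocks are physically adjacent when their pointers
 *	differ by exactly fs_frag.  Given sbp[] = { 800, 808, 816, 900, ... },
 *	the loop walks 800 -> 808 -> 816, stops at 900, and the function
 *	returns an extent of 3 blocks.
 */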
/*
* Free any blocks which had been successfully allocated. Always called
* as a result of an error, so we don't bother returning an error code
* from here.
*
* If block_count and inode_sector_adjust are both zero, we'll do nothing.
* Thus it is safe to call this as part of error handling, whether or not
* any blocks have been allocated.
*
* The ufs_inode_direct case is currently unused.
*/
static void
ufs_undo_allocation(
inode_t *ip,
int block_count,
struct ufs_allocated_block table[],
int inode_sector_adjust)
{
int i;
int inode_changed;
int error_updating_pointers;
struct ufsvfs *ufsvfsp;
inode_changed = 0;
error_updating_pointers = 0;
ufsvfsp = ip->i_ufsvfs;
/*
* Update pointers on disk before freeing blocks. If we fail,
* some blocks may remain busy; but they will be reclaimed by
* an fsck. (This is better than letting a block wind up with
* two owners if we successfully freed it but could not remove
* the pointer to it.)
*/
for (i = 0; i < block_count; i++) {
switch (table[i].owner) {
case ufs_no_owner:
/* Nothing to do here, nobody points to us */
break;
case ufs_inode_direct:
ASSERT(table[i].owner_offset < NDADDR);
ip->i_db[table[i].owner_offset] = 0;
inode_changed = 1;
break;
case ufs_inode_indirect:
ASSERT(table[i].owner_offset < NIADDR);
ip->i_ib[table[i].owner_offset] = 0;
inode_changed = 1;
break;
case ufs_indirect_block: {
buf_t *bp;
daddr32_t *block_data;
/* Read/modify/log/write. */
ASSERT(table[i].owner_offset <
(VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
bp = UFS_BREAD(ufsvfsp, ip->i_dev,
fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
VBSIZE(ITOV(ip)));
if (bp->b_flags & B_ERROR) {
/* Couldn't read this block; give up. */
error_updating_pointers = 1;
brelse(bp);
break; /* out of SWITCH */
}
block_data = bp->b_un.b_daddr;
block_data[table[i].owner_offset] = 0;
/* Write a log entry which includes the zero. */
/* It might be possible to optimize this by using */
/* TRANS_BUF directly and zeroing only the four */
/* bytes involved, but an attempt to do that led */
/* to panics in the logging code. The attempt was */
/* TRANS_BUF(ufsvfsp, */
/* table[i].owner_offset * sizeof (daddr32_t), */
/* sizeof (daddr32_t), */
/* bp, */
/* DT_ABZERO); */
TRANS_BUF_ITEM_128(ufsvfsp,
block_data[table[i].owner_offset],
block_data, bp, DT_AB);
/* Now we can write the buffer itself. */
UFS_BWRITE2(ufsvfsp, bp);
if (bp->b_flags & B_ERROR) {
error_updating_pointers = 1;
}
brelse(bp);
break;
}
default:
(void) ufs_fault(ITOV(ip),
"ufs_undo_allocation failure\n");
break;
}
}
/*
* If the inode changed, or if we need to update its block count,
* then do that now. We update the inode synchronously on disk
* to ensure that it won't transiently point at a block we've
* freed (only necessary if we're not logging).
*
* NOTE: Currently ufs_iupdat() does not check for errors. When
* it is fixed, we should verify that we successfully updated the
* inode before freeing blocks below.
*/
if (inode_changed || (inode_sector_adjust != 0)) {
ip->i_blocks -= inode_sector_adjust;
ASSERT((unsigned)ip->i_blocks <= INT_MAX);
TRANS_INODE(ufsvfsp, ip);
ip->i_flag |= IUPD | ICHG | IATTCHG;
ip->i_seq++;
if (!TRANS_ISTRANS(ufsvfsp))
ufs_iupdat(ip, I_SYNC);
}
/*
* Now we go through and actually free the blocks, but only if we
* successfully removed the pointers to them.
*/
if (!error_updating_pointers) {
for (i = 0; i < block_count; i++) {
free(ip, table[i].this_block, table[i].block_size,
table[i].usage_flags);
}
}
}
/*
 * Find the next hole or data block in the file starting at *off.
 * Return the found offset in *off.
* This code is based on bmap_read().
* Errors: ENXIO for end of file
* EIO for block read error.
*/
int
bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
{
ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
buf_t *bp[NIADDR];
int i, j;
int shft; /* we maintain sh = 1 << shft */
int nindirshift, nindiroffset;
daddr_t ob, nb, tbn, lbn, skip;
daddr32_t *bap;
u_offset_t isz = (offset_t)ip->i_size;
int32_t bs = fs->fs_bsize; /* file system block size */
int32_t nindir = fs->fs_nindir;
dev_t dev;
int error = 0;
daddr_t limits[NIADDR];
ASSERT(*off < isz);
ASSERT(RW_LOCK_HELD(&ip->i_contents));
ASSERT(blkoff(fs, *off) == 0);
lbn = (daddr_t)lblkno(fs, *off);
ASSERT(lbn >= 0);
for (i = 0; i < NIADDR; i++)
bp[i] = NULL;
/*
* The first NDADDR blocks are direct blocks.
*/
if (lbn < NDADDR) {
for (; lbn < NDADDR; lbn++) {
if ((hole && (ip->i_db[lbn] == 0)) ||
(!hole && (ip->i_db[lbn] != 0))) {
goto out;
}
}
if ((u_offset_t)lbn << fs->fs_bshift >= isz)
goto out;
}
nindir = fs->fs_nindir;
nindirshift = ufsvfsp->vfs_nindirshift;
nindiroffset = ufsvfsp->vfs_nindiroffset;
dev = ip->i_dev;
/* Set up limits array */
for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
loop:
/*
* Determine how many levels of indirection.
*/
shft = 0; /* sh = 1 */
tbn = lbn - NDADDR;
for (j = NIADDR; j > 0; j--) {
longlong_t sh;
shft += nindirshift; /* sh *= nindir */
sh = 1LL << shft;
if (tbn < sh)
break;
tbn -= sh;
}
if (j == 0) {
/* must have passed end of file */
ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
goto out;
}
/*
* Fetch the first indirect block.
*/
nb = ip->i_ib[NIADDR - j];
if (nb == 0) {
if (hole) {
lbn = limits[NIADDR - j];
goto out;
} else {
lbn = limits[NIADDR - j + 1];
if ((u_offset_t)lbn << fs->fs_bshift >= isz)
goto out;
goto loop;
}
}
/*
* Fetch through the indirect blocks.
*/
for (; ((j <= NIADDR) && (nb != 0)); j++) {
ob = nb;
/*
* if there's a different block at this level then release
		 * the old one and read in the new.
*/
if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
if (bp[j-1] != NULL)
brelse(bp[j-1]);
bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
if (bp[j-1]->b_flags & B_ERROR) {
error = EIO;
goto out;
}
}
bap = bp[j-1]->b_un.b_daddr;
shft -= nindirshift; /* sh / nindir */
i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
nb = bap[i];
skip = 1LL << (nindirshift * (NIADDR - j));
}
/*
* Scan through the blocks in this array.
*/
for (; i < nindir; i++, lbn += skip) {
if (hole && (bap[i] == 0))
goto out;
if (!hole && (bap[i] != 0)) {
if (skip == 1) {
/* we're at the lowest level */
goto out;
} else {
goto loop;
}
}
}
if (((u_offset_t)lbn << fs->fs_bshift) < isz)
goto loop;
out:
for (i = 0; i < NIADDR; i++) {
if (bp[i])
brelse(bp[i]);
}
if (error == 0) {
if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
error = ENXIO;
} else {
/* success */
*off = (u_offset_t)lbn << fs->fs_bshift;
}
}
return (error);
}
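/*
 * Illustrative (hypothetical) usage sketch for bmap_find(): locate the
 * first hole at or beyond a block-aligned offset, with ip->i_contents
 * held and the starting offset known to be less than ip->i_size:
 *
 *	u_offset_t off = P2ALIGN(start, (u_offset_t)fs->fs_bsize);
 *	int err = bmap_find(ip, B_TRUE, &off);
 *	if (err == 0)
 *		... "off" is the byte offset of the next hole ...
 *	else if (err == ENXIO)
 *		... no hole before the end of the file ...
 */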