udf_vnops.c revision aa59c4cb15a6ac5d4e585dadf7a055b580abf579
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/fbuf.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/modctl.h>
#include <sys/statvfs.h>
#include <sys/mount.h>
#include <sys/sunddi.h>
#include <sys/bootconf.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <fs/fs_subr.h>
#include <sys/fs/udf_volume.h>
#include <sys/fs/udf_inode.h>
static int32_t udf_open(struct vnode **,
int32_t, struct cred *);
static int32_t udf_close(struct vnode *,
int32_t, int32_t, offset_t, struct cred *);
static int32_t udf_read(struct vnode *,
struct uio *, int32_t, struct cred *, struct caller_context *);
static int32_t udf_write(struct vnode *,
struct uio *, int32_t, struct cred *, struct caller_context *);
static int32_t udf_ioctl(struct vnode *,
int32_t, intptr_t, int32_t, struct cred *, int32_t *);
static int32_t udf_getattr(struct vnode *,
struct vattr *, int32_t, struct cred *);
static int32_t udf_setattr(struct vnode *,
struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
int32_t, int32_t, struct cred *);
static int32_t udf_lookup(struct vnode *,
char *, struct vnode **, struct pathname *,
int32_t, struct vnode *, struct cred *);
static int32_t udf_create(struct vnode *,
char *, struct vattr *, enum vcexcl,
int32_t, struct vnode **, struct cred *, int32_t);
static int32_t udf_remove(struct vnode *,
char *, struct cred *);
static int32_t udf_link(struct vnode *,
struct vnode *, char *, struct cred *);
static int32_t udf_rename(struct vnode *,
char *, struct vnode *, char *, struct cred *);
static int32_t udf_mkdir(struct vnode *,
char *, struct vattr *, struct vnode **, struct cred *);
static int32_t udf_rmdir(struct vnode *,
char *, struct vnode *, struct cred *);
static int32_t udf_readdir(struct vnode *,
struct uio *, struct cred *, int32_t *);
static int32_t udf_symlink(struct vnode *,
char *, struct vattr *, char *, struct cred *);
static int32_t udf_readlink(struct vnode *,
struct uio *, struct cred *);
static int32_t udf_fsync(struct vnode *,
int32_t, struct cred *);
static void udf_inactive(struct vnode *,
struct cred *);
static int32_t udf_fid(struct vnode *, struct fid *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *);
static int32_t udf_space(struct vnode *, int32_t,
struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
size_t, uint32_t *, struct page **, size_t,
struct seg *, caddr_t, enum seg_rw, struct cred *);
static int32_t udf_putpage(struct vnode *, offset_t,
size_t, int32_t, struct cred *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
ulong_t *, struct cred *);
static int32_t udf_pageio(struct vnode *, struct page *,
u_offset_t, size_t, int32_t, struct cred *);
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
size_t, struct seg *, caddr_t, page_t *pl[],
size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
/*
* Structures to control multiple IO operations to get or put pages
* that are backed by discontiguous blocks. The master struct is
* a dummy that holds the original bp from pageio_setup. The
* slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
* IO op has completed.
*/
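/*
 * Source of the mm_index tags stamped on each master (debugging aid;
 * presumably incremented per master in ud_multi_strat()).
 */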
uint32_t master_index = 0;
typedef struct mio_master {
kmutex_t mm_mutex; /* protect the fields below */
int32_t mm_size;
buf_t *mm_bp; /* original bp */
int32_t mm_resid; /* bytes remaining to transfer */
int32_t mm_error; /* accumulated error from slaves */
int32_t mm_index; /* XXX debugging */
} mio_master_t;
typedef struct mio_slave {
buf_t ms_buf; /* working buffer for this IO chunk */
mio_master_t *ms_ptr; /* pointer to master */
} mio_slave_t;
struct vnodeops *udf_vnodeops;
const fs_operation_def_t udf_vnodeops_template[] = {
VOPNAME_OPEN, { .vop_open = udf_open },
VOPNAME_CLOSE, { .vop_close = udf_close },
VOPNAME_READ, { .vop_read = udf_read },
VOPNAME_WRITE, { .vop_write = udf_write },
VOPNAME_IOCTL, { .vop_ioctl = udf_ioctl },
VOPNAME_GETATTR, { .vop_getattr = udf_getattr },
VOPNAME_SETATTR, { .vop_setattr = udf_setattr },
VOPNAME_ACCESS, { .vop_access = udf_access },
VOPNAME_LOOKUP, { .vop_lookup = udf_lookup },
VOPNAME_CREATE, { .vop_create = udf_create },
VOPNAME_REMOVE, { .vop_remove = udf_remove },
VOPNAME_LINK, { .vop_link = udf_link },
VOPNAME_RENAME, { .vop_rename = udf_rename },
VOPNAME_MKDIR, { .vop_mkdir = udf_mkdir },
VOPNAME_RMDIR, { .vop_rmdir = udf_rmdir },
VOPNAME_READDIR, { .vop_readdir = udf_readdir },
VOPNAME_SYMLINK, { .vop_symlink = udf_symlink },
VOPNAME_READLINK, { .vop_readlink = udf_readlink },
VOPNAME_FSYNC, { .vop_fsync = udf_fsync },
VOPNAME_INACTIVE, { .vop_inactive = udf_inactive },
VOPNAME_FID, { .vop_fid = udf_fid },
VOPNAME_RWLOCK, { .vop_rwlock = udf_rwlock },
VOPNAME_RWUNLOCK, { .vop_rwunlock = udf_rwunlock },
VOPNAME_SEEK, { .vop_seek = udf_seek },
VOPNAME_FRLOCK, { .vop_frlock = udf_frlock },
VOPNAME_SPACE, { .vop_space = udf_space },
VOPNAME_GETPAGE, { .vop_getpage = udf_getpage },
VOPNAME_PUTPAGE, { .vop_putpage = udf_putpage },
VOPNAME_MAP, { .vop_map = udf_map },
VOPNAME_ADDMAP, { .vop_addmap = udf_addmap },
VOPNAME_DELMAP, { .vop_delmap = udf_delmap },
VOPNAME_PATHCONF, { .vop_pathconf = udf_l_pathconf },
VOPNAME_PAGEIO, { .vop_pageio = udf_pageio },
VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
NULL, NULL
};
/* ARGSUSED */
static int32_t
udf_open(struct vnode **vpp, int32_t flag, struct cred *cr)
{
ud_printf("udf_open\n");
return (0);
}
/* ARGSUSED */
static int32_t
udf_close(struct vnode *vp, int32_t flag,
int32_t count, offset_t offset, struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_close\n");
ITIMES(ip);
cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
cleanshares(vp, ttoproc(curthread)->p_pid);
/*
* Push partially filled cluster at last close.
* ``last close'' is approximated because the dnlc
* may have a hold on the vnode.
*/
if (vp->v_count <= 2 && vp->v_type != VBAD) {
if (ip->i_delaylen) {
(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
B_ASYNC | B_FREE, cr);
ip->i_delaylen = 0;
}
}
return (0);
}
static int32_t
udf_read(struct vnode *vp, struct uio *uiop,
int32_t ioflag, struct cred *cr, struct caller_context *ct)
{
struct ud_inode *ip = VTOI(vp);
int32_t error;
ud_printf("udf_read\n");
#ifdef __lock_lint
rw_enter(&ip->i_rwlock, RW_READER);
#endif
ASSERT(RW_READ_HELD(&ip->i_rwlock));
if (MANDLOCK(vp, ip->i_char)) {
/*
* udf_getattr ends up being called by chklock
*/
error = chklock(vp, FREAD, uiop->uio_loffset,
uiop->uio_resid, uiop->uio_fmode, ct);
if (error) {
goto end;
}
}
rw_enter(&ip->i_contents, RW_READER);
error = ud_rdip(ip, uiop, ioflag, cr);
rw_exit(&ip->i_contents);
end:
#ifdef __lock_lint
rw_exit(&ip->i_rwlock);
#endif
return (error);
}
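/*
 * Write throttling tunables: ud_WRITES enables throttling, ud_HW and
 * ud_LW are the high and low watermarks (bytes of outstanding delayed
 * writes on an inode), and ud_throttles counts how many times a writer
 * had to wait.  Writers block in udf_write() once i_writes exceeds
 * ud_HW and are woken by ud_iodone() when it drains to ud_LW.
 */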
int32_t ud_WRITES = 1;
int32_t ud_HW = 96 * 1024;
int32_t ud_LW = 64 * 1024;
int32_t ud_throttles = 0;
static int32_t
udf_write(struct vnode *vp, struct uio *uiop,
int32_t ioflag, struct cred *cr, struct caller_context *ct)
{
struct ud_inode *ip = VTOI(vp);
int32_t error = 0;
ud_printf("udf_write\n");
#ifdef __lock_lint
rw_enter(&ip->i_rwlock, RW_WRITER);
#endif
ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
if (MANDLOCK(vp, ip->i_char)) {
/*
 * udf_getattr ends up being called by chklock
*/
error = chklock(vp, FWRITE, uiop->uio_loffset,
uiop->uio_resid, uiop->uio_fmode, ct);
if (error) {
goto end;
}
}
/*
* Throttle writes.
*/
mutex_enter(&ip->i_tlock);
if (ud_WRITES && (ip->i_writes > ud_HW)) {
while (ip->i_writes > ud_HW) {
ud_throttles++;
cv_wait(&ip->i_wrcv, &ip->i_tlock);
}
}
mutex_exit(&ip->i_tlock);
/*
* Write to the file
*/
rw_enter(&ip->i_contents, RW_WRITER);
if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
/*
* In append mode start at end of file.
*/
uiop->uio_loffset = ip->i_size;
}
error = ud_wrip(ip, uiop, ioflag, cr);
rw_exit(&ip->i_contents);
end:
#ifdef __lock_lint
rw_exit(&ip->i_rwlock);
#endif
return (error);
}
/* ARGSUSED */
static int32_t
udf_ioctl(struct vnode *vp, int32_t cmd, intptr_t arg,
int32_t flag, struct cred *cr, int32_t *rvalp)
{
return (ENOTTY);
}
/* ARGSUSED */
static int32_t
udf_getattr(struct vnode *vp,
struct vattr *vap, int32_t flags, struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_getattr\n");
if (vap->va_mask == AT_SIZE) {
/*
 * For performance, if only the size is requested, don't bother
 * with anything else.
*/
vap->va_size = ip->i_size;
return (0);
}
rw_enter(&ip->i_contents, RW_READER);
vap->va_type = vp->v_type;
vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
vap->va_uid = ip->i_uid;
vap->va_gid = ip->i_gid;
vap->va_fsid = ip->i_dev;
vap->va_nodeid = ip->i_icb_lbano;
vap->va_nlink = ip->i_nlink;
vap->va_size = ip->i_size;
vap->va_seq = ip->i_seq;
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vap->va_rdev = ip->i_rdev;
} else {
vap->va_rdev = 0;
}
mutex_enter(&ip->i_tlock);
ITIMES_NOLOCK(ip); /* mark correct time in inode */
vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
mutex_exit(&ip->i_tlock);
switch (ip->i_type) {
case VBLK:
vap->va_blksize = MAXBSIZE;
break;
case VCHR:
vap->va_blksize = MAXBSIZE;
break;
default:
vap->va_blksize = ip->i_udf->udf_lbsize;
break;
}
vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
rw_exit(&ip->i_contents);
return (0);
}
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr));
}
/*ARGSUSED4*/
static int32_t
udf_setattr(
struct vnode *vp,
struct vattr *vap,
int32_t flags,
struct cred *cr,
caller_context_t *ct)
{
int32_t error = 0;
uint32_t mask = vap->va_mask;
struct ud_inode *ip;
timestruc_t now;
struct vattr ovap;
ud_printf("udf_setattr\n");
ip = VTOI(vp);
/*
 * No updates allowed to 4096 (STRAT_TYPE4096) files.
*/
if (ip->i_astrat == STRAT_TYPE4096) {
return (EINVAL);
}
/*
* Cannot set these attributes
*/
if (mask & AT_NOSET) {
return (EINVAL);
}
rw_enter(&ip->i_rwlock, RW_WRITER);
rw_enter(&ip->i_contents, RW_WRITER);
ovap.va_uid = ip->i_uid;
ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
ud_iaccess_vmode, ip);
if (error)
goto update_inode;
mask = vap->va_mask;
/*
* Change file access modes.
*/
if (mask & AT_MODE) {
ip->i_perm = VA2UD_PERM(vap->va_mode);
ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
mutex_enter(&ip->i_tlock);
ip->i_flag |= ICHG;
mutex_exit(&ip->i_tlock);
}
if (mask & (AT_UID|AT_GID)) {
if (mask & AT_UID) {
ip->i_uid = vap->va_uid;
}
if (mask & AT_GID) {
ip->i_gid = vap->va_gid;
}
mutex_enter(&ip->i_tlock);
ip->i_flag |= ICHG;
mutex_exit(&ip->i_tlock);
}
/*
* Truncate file. Must have write permission and not be a directory.
*/
if (mask & AT_SIZE) {
if (vp->v_type == VDIR) {
error = EISDIR;
goto update_inode;
}
if (error = ud_iaccess(ip, IWRITE, cr)) {
goto update_inode;
}
if (vap->va_size > MAXOFFSET_T) {
error = EFBIG;
goto update_inode;
}
if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
goto update_inode;
}
}
/*
* Change file access or modified times.
*/
if (mask & (AT_ATIME|AT_MTIME)) {
mutex_enter(&ip->i_tlock);
if (mask & AT_ATIME) {
ip->i_atime.tv_sec = vap->va_atime.tv_sec;
ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
ip->i_flag &= ~IACC;
}
if (mask & AT_MTIME) {
ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
gethrestime(&now);
ip->i_ctime.tv_sec = now.tv_sec;
ip->i_ctime.tv_nsec = now.tv_nsec;
ip->i_flag &= ~(IUPD|ICHG);
ip->i_flag |= IMODTIME;
}
ip->i_flag |= IMOD;
mutex_exit(&ip->i_tlock);
}
update_inode:
if (curthread->t_flag & T_DONTPEND) {
ud_iupdat(ip, 1);
} else {
ITIMES_NOLOCK(ip);
}
rw_exit(&ip->i_contents);
rw_exit(&ip->i_rwlock);
return (error);
}
/* ARGSUSED */
static int32_t
udf_access(struct vnode *vp,
int32_t mode, int32_t flags, struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
int32_t error;
ud_printf("udf_access\n");
if (ip->i_udf == NULL) {
return (EIO);
}
error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr);
return (error);
}
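/*
 * When set, udf_lookup() flags sticky, non-executable non-directories
 * with VISSWAP (see below).
 */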
int32_t udfs_stickyhack = 1;
/* ARGSUSED */
static int32_t
udf_lookup(struct vnode *dvp,
char *nm, struct vnode **vpp, struct pathname *pnp,
int32_t flags, struct vnode *rdir, struct cred *cr)
{
int32_t error;
struct vnode *vp;
struct ud_inode *ip, *xip;
ud_printf("udf_lookup\n");
/*
 * A null component name is a synonym for the directory being searched.
*/
if (*nm == '\0') {
VN_HOLD(dvp);
*vpp = dvp;
error = 0;
goto out;
}
/*
* Fast path: Check the directory name lookup cache.
*/
ip = VTOI(dvp);
if (vp = dnlc_lookup(dvp, nm)) {
/*
* Check accessibility of directory.
*/
if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) {
VN_RELE(vp);
}
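		/* xip is consumed below only when error == 0. */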
xip = VTOI(vp);
} else {
error = ud_dirlook(ip, nm, &xip, cr, 1);
ITIMES(ip);
}
if (error == 0) {
ip = xip;
*vpp = ITOV(ip);
if ((ip->i_type != VDIR) &&
(ip->i_char & ISVTX) &&
((ip->i_perm & IEXEC) == 0) &&
udfs_stickyhack) {
mutex_enter(&(*vpp)->v_lock);
(*vpp)->v_flag |= VISSWAP;
mutex_exit(&(*vpp)->v_lock);
}
ITIMES(ip);
/*
* If vnode is a device return special vnode instead.
*/
if (IS_DEVVP(*vpp)) {
struct vnode *newvp;
newvp = specvp(*vpp, (*vpp)->v_rdev,
(*vpp)->v_type, cr);
VN_RELE(*vpp);
if (newvp == NULL) {
error = ENOSYS;
} else {
*vpp = newvp;
}
}
}
out:
return (error);
}
/* ARGSUSED */
static int32_t
udf_create(struct vnode *dvp,
char *name, struct vattr *vap, enum vcexcl excl,
int32_t mode, struct vnode **vpp, struct cred *cr, int32_t flag)
{
int32_t error;
struct ud_inode *ip = VTOI(dvp), *xip;
ud_printf("udf_create\n");
if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
vap->va_mode &= ~VSVTX;
if (*name == '\0') {
/*
* Null component name refers to the directory itself.
*/
VN_HOLD(dvp);
ITIMES(ip);
error = EEXIST;
} else {
xip = NULL;
rw_enter(&ip->i_rwlock, RW_WRITER);
error = ud_direnter(ip, name, DE_CREATE,
(struct ud_inode *)0, (struct ud_inode *)0,
vap, &xip, cr);
rw_exit(&ip->i_rwlock);
ITIMES(ip);
ip = xip;
}
#ifdef __lock_lint
rw_enter(&ip->i_contents, RW_WRITER);
#else
if (ip != NULL) {
rw_enter(&ip->i_contents, RW_WRITER);
}
#endif
/*
* If the file already exists and this is a non-exclusive create,
* check permissions and allow access for non-directories.
* Read-only create of an existing directory is also allowed.
* We fail an exclusive create of anything which already exists.
*/
if (error == EEXIST) {
if (excl == NONEXCL) {
if ((ip->i_type == VDIR) && (mode & VWRITE)) {
error = EISDIR;
} else if (mode) {
error = ud_iaccess(ip,
UD_UPERM2DPERM(mode), cr);
} else {
error = 0;
}
}
if (error) {
rw_exit(&ip->i_contents);
VN_RELE(ITOV(ip));
goto out;
} else if ((ip->i_type == VREG) &&
(vap->va_mask & AT_SIZE) && vap->va_size == 0) {
/*
* Truncate regular files, if requested by caller.
* Grab i_rwlock to make sure no one else is
* currently writing to the file (we promised
* bmap we would do this).
* Must get the locks in the correct order.
*/
if (ip->i_size == 0) {
ip->i_flag |= ICHG | IUPD;
} else {
rw_exit(&ip->i_contents);
rw_enter(&ip->i_rwlock, RW_WRITER);
rw_enter(&ip->i_contents, RW_WRITER);
(void) ud_itrunc(ip, 0, 0, cr);
rw_exit(&ip->i_rwlock);
}
}
}
if (error == 0) {
*vpp = ITOV(ip);
ITIMES(ip);
}
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (ip != NULL) {
rw_exit(&ip->i_contents);
}
#endif
if (error) {
goto out;
}
/*
* If vnode is a device return special vnode instead.
*/
if (!error && IS_DEVVP(*vpp)) {
struct vnode *newvp;
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
VN_RELE(*vpp);
if (newvp == NULL) {
error = ENOSYS;
goto out;
}
*vpp = newvp;
}
out:
return (error);
}
static int32_t
udf_remove(struct vnode *vp, char *nm, struct cred *cr)
{
int32_t error;
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_remove\n");
rw_enter(&ip->i_rwlock, RW_WRITER);
error = ud_dirremove(ip, nm,
(struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr);
rw_exit(&ip->i_rwlock);
ITIMES(ip);
return (error);
}
static int32_t
udf_link(struct vnode *tdvp,
struct vnode *svp, char *tnm, struct cred *cr)
{
int32_t error;
struct vnode *realvp;
struct ud_inode *sip;
struct ud_inode *tdp;
ud_printf("udf_link\n");
if (VOP_REALVP(svp, &realvp) == 0) {
svp = realvp;
}
/*
* Do not allow links to directories
*/
if (svp->v_type == VDIR) {
return (EPERM);
}
sip = VTOI(svp);
if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
return (EPERM);
tdp = VTOI(tdvp);
rw_enter(&tdp->i_rwlock, RW_WRITER);
error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
sip, (struct vattr *)0, (struct ud_inode **)0, cr);
rw_exit(&tdp->i_rwlock);
ITIMES(sip);
ITIMES(tdp);
return (error);
}
/* ARGSUSED */
static int32_t
udf_rename(struct vnode *sdvp,
char *snm, struct vnode *tdvp,
char *tnm, struct cred *cr)
{
int32_t error = 0;
struct udf_vfs *udf_vfsp;
struct ud_inode *sip; /* source inode */
struct ud_inode *sdp, *tdp; /* source and target parent inode */
struct vnode *realvp;
ud_printf("udf_rename\n");
if (VOP_REALVP(tdvp, &realvp) == 0) {
tdvp = realvp;
}
sdp = VTOI(sdvp);
tdp = VTOI(tdvp);
udf_vfsp = sdp->i_udf;
mutex_enter(&udf_vfsp->udf_rename_lck);
/*
* Look up inode of file we're supposed to rename.
*/
if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
mutex_exit(&udf_vfsp->udf_rename_lck);
return (error);
}
/*
 * Be sure this is not a directory with another file system mounted
 * over it. If it is, just give up the locks and return
 * EBUSY.
*/
if (vn_mountedvfs(ITOV(sip)) != NULL) {
error = EBUSY;
goto errout;
}
/*
* Make sure we can delete the source entry. This requires
* write permission on the containing directory. If that
* directory is "sticky" it further requires (except for
* privileged users) that the user own the directory or the
* source entry, or else have permission to write the source
* entry.
*/
rw_enter(&sdp->i_contents, RW_READER);
rw_enter(&sip->i_contents, RW_READER);
if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 ||
(error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
rw_exit(&sip->i_contents);
rw_exit(&sdp->i_contents);
ITIMES(sip);
goto errout;
}
/*
* Check for renaming '.' or '..' or alias of '.'
*/
if ((strcmp(snm, ".") == 0) ||
(strcmp(snm, "..") == 0) ||
(sdp == sip)) {
error = EINVAL;
rw_exit(&sip->i_contents);
rw_exit(&sdp->i_contents);
goto errout;
}
rw_exit(&sip->i_contents);
rw_exit(&sdp->i_contents);
/*
* Link source to the target.
*/
rw_enter(&tdp->i_rwlock, RW_WRITER);
if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
(struct vattr *)0, (struct ud_inode **)0, cr)) {
/*
* ESAME isn't really an error; it indicates that the
* operation should not be done because the source and target
* are the same file, but that no error should be reported.
*/
if (error == ESAME) {
error = 0;
}
rw_exit(&tdp->i_rwlock);
goto errout;
}
rw_exit(&tdp->i_rwlock);
rw_enter(&sdp->i_rwlock, RW_WRITER);
/*
* Unlink the source.
* Remove the source entry. ud_dirremove() checks that the entry
* still reflects sip, and returns an error if it doesn't.
* If the entry has changed just forget about it. Release
* the source inode.
*/
if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
DR_RENAME, cr)) == ENOENT) {
error = 0;
}
rw_exit(&sdp->i_rwlock);
errout:
ITIMES(sdp);
ITIMES(tdp);
VN_RELE(ITOV(sip));
mutex_exit(&udf_vfsp->udf_rename_lck);
return (error);
}
static int32_t
udf_mkdir(struct vnode *dvp,
char *dirname, struct vattr *vap,
struct vnode **vpp, struct cred *cr)
{
int32_t error;
struct ud_inode *ip;
struct ud_inode *xip;
ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
ud_printf("udf_mkdir\n");
ip = VTOI(dvp);
rw_enter(&ip->i_rwlock, RW_WRITER);
error = ud_direnter(ip, dirname, DE_MKDIR,
(struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr);
rw_exit(&ip->i_rwlock);
ITIMES(ip);
if (error == 0) {
ip = xip;
*vpp = ITOV(ip);
ITIMES(ip);
} else if (error == EEXIST) {
ITIMES(xip);
VN_RELE(ITOV(xip));
}
return (error);
}
static int32_t
udf_rmdir(struct vnode *vp,
char *nm, struct vnode *cdir, struct cred *cr)
{
int32_t error;
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_rmdir\n");
rw_enter(&ip->i_rwlock, RW_WRITER);
error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, cr);
rw_exit(&ip->i_rwlock);
ITIMES(ip);
return (error);
}
/* ARGSUSED */
static int32_t
udf_readdir(struct vnode *vp,
struct uio *uiop, struct cred *cr, int32_t *eofp)
{
struct ud_inode *ip;
struct dirent64 *nd;
struct udf_vfs *udf_vfsp;
int32_t error = 0, len, outcount = 0;
uint32_t dirsiz, offset;
uint32_t bufsize, ndlen, dummy;
caddr_t outbuf;
caddr_t outb, end_outb;
struct iovec *iovp;
uint8_t *dname;
int32_t length;
uint8_t *buf = NULL;
struct fbuf *fbp = NULL;
struct file_id *fid;
uint8_t *name;
ud_printf("udf_readdir\n");
ip = VTOI(vp);
udf_vfsp = ip->i_udf;
dirsiz = ip->i_size;
if ((uiop->uio_offset >= dirsiz) ||
(ip->i_nlink <= 0)) {
if (eofp) {
*eofp = 1;
}
return (0);
}
offset = uiop->uio_offset;
iovp = uiop->uio_iov;
bufsize = iovp->iov_len;
outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
end_outb = outb + bufsize;
nd = (struct dirent64 *)outbuf;
dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
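	/*
	 * Offset 0 means we must synthesize the "." entry.  Its d_off is
	 * the magic value 0x10, which a later call maps back to directory
	 * offset 0 below so the real FID scan starts at the beginning.
	 */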
if (offset == 0) {
len = DIRENT64_RECLEN(1);
if (((caddr_t)nd + len) >= end_outb) {
error = EINVAL;
goto end;
}
nd->d_ino = ip->i_icb_lbano;
nd->d_reclen = (uint16_t)len;
nd->d_off = 0x10;
nd->d_name[0] = '.';
bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
outcount++;
} else if (offset == 0x10) {
offset = 0;
}
while (offset < dirsiz) {
error = ud_get_next_fid(ip, &fbp,
offset, &fid, &name, buf);
if (error != 0) {
break;
}
if ((fid->fid_flags & FID_DELETED) == 0) {
if (fid->fid_flags & FID_PARENT) {
len = DIRENT64_RECLEN(2);
if (((caddr_t)nd + len) >= end_outb) {
error = EINVAL;
break;
}
nd->d_ino = ip->i_icb_lbano;
nd->d_reclen = (uint16_t)len;
nd->d_off = offset + FID_LEN(fid);
nd->d_name[0] = '.';
nd->d_name[1] = '.';
bzero(&nd->d_name[2],
DIRENT64_NAMELEN(len) - 2);
nd = (struct dirent64 *)
((char *)nd + nd->d_reclen);
} else {
if ((error = ud_uncompress(fid->fid_idlen,
&length, name, dname)) != 0) {
break;
}
if (length == 0) {
offset += FID_LEN(fid);
continue;
}
len = DIRENT64_RECLEN(length);
if (((caddr_t)nd + len) >= end_outb) {
if (!outcount) {
error = EINVAL;
}
break;
}
(void) strncpy(nd->d_name,
(caddr_t)dname, length);
bzero(&nd->d_name[length],
DIRENT64_NAMELEN(len) - length);
nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
SWAP_16(fid->fid_icb.lad_ext_prn),
SWAP_32(fid->fid_icb.lad_ext_loc), 1,
&dummy);
nd->d_reclen = (uint16_t)len;
nd->d_off = offset + FID_LEN(fid);
nd = (struct dirent64 *)
((char *)nd + nd->d_reclen);
}
outcount++;
}
offset += FID_LEN(fid);
}
end:
if (fbp != NULL) {
fbrelse(fbp, S_OTHER);
}
ndlen = ((char *)nd - outbuf);
/*
* In case of error do not call uiomove.
* Return the error to the caller.
*/
if ((error == 0) && (ndlen != 0)) {
error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
uiop->uio_offset = offset;
}
kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
kmem_free((caddr_t)dname, 1024);
kmem_free(outbuf, (uint32_t)bufsize);
if (eofp && error == 0) {
*eofp = (uiop->uio_offset >= dirsiz);
}
return (error);
}
/* ARGSUSED */
static int32_t
udf_symlink(struct vnode *dvp,
char *linkname, struct vattr *vap,
char *target, struct cred *cr)
{
int32_t error = 0, outlen;
uint32_t ioflag = 0;
struct ud_inode *ip, *dip = VTOI(dvp);
struct path_comp *pc;
int8_t *dname = NULL, *uname = NULL, *sp;
ud_printf("udf_symlink\n");
ip = (struct ud_inode *)0;
vap->va_type = VLNK;
vap->va_rdev = 0;
rw_enter(&dip->i_rwlock, RW_WRITER);
error = ud_direnter(dip, linkname, DE_CREATE,
(struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr);
rw_exit(&dip->i_rwlock);
if (error == 0) {
dname = kmem_zalloc(1024, KM_SLEEP);
uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
pc = (struct path_comp *)uname;
/*
 * If the first character in target is "/", create a root
 * path component (type 2) for it, then skip the leading slashes.
*/
if (*target == '/') {
pc->pc_type = 2;
pc->pc_len = 0;
pc = (struct path_comp *)(((char *)pc) + 4);
while (*target == '/') {
target++;
}
}
while (*target != '\0') {
sp = target;
while ((*target != '/') && (*target != '\0')) {
target ++;
}
/*
* We got the next component of the
 * path name. Create a path_comp of the
 * appropriate type.
*/
if (((target - sp) == 1) && (*sp == '.')) {
/*
* Dot entry.
*/
pc->pc_type = 4;
pc = (struct path_comp *)(((char *)pc) + 4);
} else if (((target - sp) == 2) &&
(*sp == '.') && ((*(sp + 1)) == '.')) {
/*
* DotDot entry.
*/
pc->pc_type = 3;
pc = (struct path_comp *)(((char *)pc) + 4);
} else {
/*
 * Convert the user-given name
 * into the appropriate form to be put
 * on the media.
*/
outlen = 1024; /* set to size of dname */
if (error = ud_compress(target - sp, &outlen,
(uint8_t *)sp, (uint8_t *)dname)) {
break;
}
pc->pc_type = 5;
/* LINTED */
pc->pc_len = outlen;
dname[outlen] = '\0';
(void) strcpy((char *)pc->pc_id, dname);
pc = (struct path_comp *)
(((char *)pc) + 4 + outlen);
}
while (*target == '/') {
target++;
}
if (*target == '\0') {
break;
}
}
rw_enter(&ip->i_contents, RW_WRITER);
if (error == 0) {
ioflag = FWRITE;
if (curthread->t_flag & T_DONTPEND) {
ioflag |= FDSYNC;
}
error = ud_rdwri(UIO_WRITE, ioflag, ip,
uname, ((int8_t *)pc) - uname,
(offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
}
if (error) {
ud_idrop(ip);
rw_exit(&ip->i_contents);
rw_enter(&dip->i_rwlock, RW_WRITER);
(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
(struct vnode *)0, DR_REMOVE, cr);
rw_exit(&dip->i_rwlock);
goto update_inode;
}
rw_exit(&ip->i_contents);
}
if ((error == 0) || (error == EEXIST)) {
VN_RELE(ITOV(ip));
}
update_inode:
ITIMES(VTOI(dvp));
if (uname != NULL) {
kmem_free(uname, PAGESIZE);
}
if (dname != NULL) {
kmem_free(dname, 1024);
}
return (error);
}
/* ARGSUSED */
static int32_t
udf_readlink(struct vnode *vp,
struct uio *uiop, struct cred *cr)
{
int32_t error = 0, off, id_len, size, len;
int8_t *dname = NULL, *uname = NULL;
struct ud_inode *ip;
struct fbuf *fbp = NULL;
struct path_comp *pc;
ud_printf("udf_readlink\n");
if (vp->v_type != VLNK) {
return (EINVAL);
}
ip = VTOI(vp);
size = ip->i_size;
if (size > PAGESIZE) {
return (EIO);
}
if (size == 0) {
return (0);
}
dname = kmem_zalloc(1024, KM_SLEEP);
uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
rw_enter(&ip->i_contents, RW_READER);
if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
goto end;
}
off = 0;
while (off < size) {
pc = (struct path_comp *)(fbp->fb_addr + off);
switch (pc->pc_type) {
case 1 :
(void) strcpy(uname, ip->i_udf->udf_fsmnt);
(void) strcat(uname, "/");
break;
case 2 :
if (pc->pc_len != 0) {
goto end;
}
uname[0] = '/';
uname[1] = '\0';
break;
case 3 :
(void) strcat(uname, "../");
break;
case 4 :
(void) strcat(uname, "./");
break;
case 5 :
if ((error = ud_uncompress(pc->pc_len, &id_len,
pc->pc_id, (uint8_t *)dname)) != 0) {
break;
}
dname[id_len] = '\0';
(void) strcat(uname, dname);
(void) strcat(uname, "/");
break;
default :
error = EINVAL;
goto end;
}
off += 4 + pc->pc_len;
}
len = strlen(uname) - 1;
if (uname[len] == '/') {
if (len == 0) {
/*
* special case link to /
*/
len = 1;
} else {
uname[len] = '\0';
}
}
error = uiomove(uname, len, UIO_READ, uiop);
ITIMES(ip);
end:
if (fbp != NULL) {
fbrelse(fbp, S_OTHER);
}
rw_exit(&ip->i_contents);
if (uname != NULL) {
kmem_free(uname, PAGESIZE);
}
if (dname != NULL) {
kmem_free(dname, 1024);
}
return (error);
}
/* ARGSUSED */
static int32_t
udf_fsync(struct vnode *vp,
int32_t syncflag, struct cred *cr)
{
int32_t error = 0;
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_fsync\n");
rw_enter(&ip->i_contents, RW_WRITER);
if (!(IS_SWAPVP(vp))) {
error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
}
if (error == 0) {
error = ud_sync_indir(ip);
}
ITIMES(ip); /* XXX: is this necessary ??? */
rw_exit(&ip->i_contents);
return (error);
}
/* ARGSUSED */
static void
udf_inactive(struct vnode *vp, struct cred *cr)
{
ud_printf("udf_iinactive\n");
ud_iinactive(VTOI(vp), cr);
}
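/*
 * Encode enough state (unique id, partition, ICB block) in the fid to
 * let the file system locate this inode again, e.g. for NFS handles.
 */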
static int32_t
udf_fid(struct vnode *vp, struct fid *fidp)
{
struct udf_fid *udfidp;
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_fid\n");
if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
return (ENOSPC);
}
udfidp = (struct udf_fid *)fidp;
bzero((char *)udfidp, sizeof (struct udf_fid));
rw_enter(&ip->i_contents, RW_READER);
udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
udfidp->udfid_prn = ip->i_icb_prn;
udfidp->udfid_icb_lbn = ip->i_icb_block;
rw_exit(&ip->i_contents);
return (0);
}
/* ARGSUSED2 */
static int
udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_rwlock\n");
if (write_lock) {
rw_enter(&ip->i_rwlock, RW_WRITER);
} else {
rw_enter(&ip->i_rwlock, RW_READER);
}
#ifdef __lock_lint
rw_exit(&ip->i_rwlock);
#endif
return (write_lock);
}
/* ARGSUSED */
static void
udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_rwunlock\n");
#ifdef __lock_lint
rw_enter(&ip->i_rwlock, RW_WRITER);
#endif
rw_exit(&ip->i_rwlock);
}
/* ARGSUSED */
static int32_t
udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
{
return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}
static int32_t
udf_frlock(struct vnode *vp, int32_t cmd, struct flock64 *bfp,
int32_t flag, offset_t offset, struct flk_callback *flk_cbp,
cred_t *cr)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_frlock\n");
/*
* If file is being mapped, disallow frlock.
* XXX I am not holding tlock while checking i_mapcnt because the
* current locking strategy drops all locks before calling fs_frlock.
 * So, mapcnt could change before we enter fs_frlock, making it
* meaningless to have held tlock in the first place.
*/
if ((ip->i_mapcnt > 0) &&
(MANDLOCK(vp, ip->i_char))) {
return (EAGAIN);
}
return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
}
/*ARGSUSED6*/
static int32_t
udf_space(
struct vnode *vp,
int32_t cmd,
struct flock64 *bfp,
int32_t flag,
offset_t offset,
cred_t *cr,
caller_context_t *ct)
{
int32_t error = 0;
ud_printf("udf_space\n");
if (cmd != F_FREESP) {
error = EINVAL;
} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
error = ud_freesp(vp, bfp, flag, cr);
}
return (error);
}
/* ARGSUSED */
static int32_t
udf_getpage(struct vnode *vp, offset_t off,
size_t len, uint32_t *protp, struct page **plarr,
size_t plsz, struct seg *seg, caddr_t addr,
enum seg_rw rw, struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
int32_t error, has_holes, beyond_eof, seqmode, dolock;
int32_t pgsize = PAGESIZE;
struct udf_vfs *udf_vfsp = ip->i_udf;
page_t **pl;
u_offset_t pgoff, eoff, uoff;
krw_t rwtype;
caddr_t pgaddr;
ud_printf("udf_getpage\n");
uoff = (u_offset_t)off; /* type conversion */
if (protp) {
*protp = PROT_ALL;
}
if (vp->v_flag & VNOMAP) {
return (ENOSYS);
}
seqmode = ip->i_nextr == uoff && rw != S_CREATE;
rwtype = RW_READER;
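	/* Avoid re-acquiring i_contents if this thread already holds it. */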
dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef __lock_lint
rw_enter(&ip->i_contents, rwtype);
#else
if (dolock) {
rw_enter(&ip->i_contents, rwtype);
}
#endif
/*
* We may be getting called as a side effect of a bmap using
* fbread() when the blocks might be being allocated and the
 * size has not yet been updated. In this case we want to be
* able to return zero pages if we get back UDF_HOLE from
* calling bmap for a non write case here. We also might have
* to read some frags from the disk into a page if we are
* extending the number of frags for a given lbn in bmap().
*/
beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
if (beyond_eof && seg != segkmap) {
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (dolock) {
rw_exit(&ip->i_contents);
}
#endif
return (EFAULT);
}
/*
* Must hold i_contents lock throughout the call to pvn_getpages
* since locked pages are returned from each call to ud_getapage.
* Must *not* return locked pages and then try for contents lock
* due to lock ordering requirements (inode > page)
*/
has_holes = ud_bmap_has_holes(ip);
if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
int32_t blk_size, count;
u_offset_t offset;
/*
* We must acquire the RW_WRITER lock in order to
* call bmap_write().
*/
if (dolock && rwtype == RW_READER) {
rwtype = RW_WRITER;
if (!rw_tryupgrade(&ip->i_contents)) {
rw_exit(&ip->i_contents);
goto retrylock;
}
}
/*
* May be allocating disk blocks for holes here as
* a result of mmap faults. write(2) does the bmap_write
* in rdip/wrip, not here. We are not dealing with frags
* in this case.
*/
offset = uoff;
while ((offset < uoff + len) &&
(offset < ip->i_size)) {
/*
* the variable "bnp" is to simplify the expression for
 * the compiler; just passing in &bn to bmap_write
* causes a compiler "loop"
*/
blk_size = udf_vfsp->udf_lbsize;
if ((offset + blk_size) > ip->i_size) {
count = ip->i_size - offset;
} else {
count = blk_size;
}
error = ud_bmap_write(ip, offset, count, 0, cr);
if (error) {
goto update_inode;
}
offset += count; /* XXX - make this contig */
}
}
/*
* Can be a reader from now on.
*/
#ifdef __lock_lint
if (rwtype == RW_WRITER) {
rw_downgrade(&ip->i_contents);
}
#else
if (dolock && rwtype == RW_WRITER) {
rw_downgrade(&ip->i_contents);
}
#endif
/*
* We remove PROT_WRITE in cases when the file has UDF holes
* because we don't want to call bmap_read() to check each
* page if it is backed with a disk block.
*/
if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
*protp &= ~PROT_WRITE;
}
error = 0;
/*
* The loop looks up pages in the range <off, off + len).
* For each page, we first check if we should initiate an asynchronous
* read ahead before we call page_lookup (we may sleep in page_lookup
* for a previously initiated disk read).
*/
eoff = (uoff + len);
for (pgoff = uoff, pgaddr = addr, pl = plarr;
pgoff < eoff; /* empty */) {
page_t *pp;
u_offset_t nextrio;
se_t se;
se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
/*
* Handle async getpage (faultahead)
*/
if (plarr == NULL) {
ip->i_nextrio = pgoff;
ud_getpage_ra(vp, pgoff, seg, pgaddr);
pgoff += pgsize;
pgaddr += pgsize;
continue;
}
/*
* Check if we should initiate read ahead of next cluster.
* We call page_exists only when we need to confirm that
* we have the current page before we initiate the read ahead.
*/
nextrio = ip->i_nextrio;
if (seqmode &&
pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
nextrio < ip->i_size && page_exists(vp, pgoff))
ud_getpage_ra(vp, pgoff, seg, pgaddr);
if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
/*
* We found the page in the page cache.
*/
*pl++ = pp;
pgoff += pgsize;
pgaddr += pgsize;
len -= pgsize;
plsz -= pgsize;
} else {
/*
* We have to create the page, or read it from disk.
*/
if (error = ud_getpage_miss(vp, pgoff, len,
seg, pgaddr, pl, plsz, rw, seqmode)) {
goto error_out;
}
while (*pl != NULL) {
pl++;
pgoff += pgsize;
pgaddr += pgsize;
len -= pgsize;
plsz -= pgsize;
}
}
}
/*
* Return pages up to plsz if they are in the page cache.
* We cannot return pages if there is a chance that they are
* backed with a UDF hole and rw is S_WRITE or S_CREATE.
*/
if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
ASSERT((protp == NULL) ||
!(has_holes && (*protp & PROT_WRITE)));
eoff = pgoff + plsz;
while (pgoff < eoff) {
page_t *pp;
if ((pp = page_lookup_nowait(vp, pgoff,
SE_SHARED)) == NULL)
break;
*pl++ = pp;
pgoff += pgsize;
plsz -= pgsize;
}
}
if (plarr)
*pl = NULL; /* Terminate page list */
ip->i_nextr = pgoff;
error_out:
if (error && plarr) {
/*
* Release any pages we have locked.
*/
while (pl > &plarr[0])
page_unlock(*--pl);
plarr[0] = NULL;
}
update_inode:
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (dolock) {
rw_exit(&ip->i_contents);
}
#endif
/*
* If the inode is not already marked for IACC (in rwip() for read)
* and the inode is not marked for no access time update (in rwip()
* for write) then update the inode access time and mod time now.
*/
mutex_enter(&ip->i_tlock);
if ((ip->i_flag & (IACC | INOACC)) == 0) {
if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
ip->i_flag |= IACC;
}
if (rw == S_WRITE) {
ip->i_flag |= IUPD;
}
ITIMES_NOLOCK(ip);
}
mutex_exit(&ip->i_tlock);
return (error);
}
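/* When set, udf_putpage() clusters delayed async writes (see below). */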
int32_t ud_delay = 1;
/* ARGSUSED */
static int32_t
udf_putpage(struct vnode *vp, offset_t off,
size_t len, int32_t flags, struct cred *cr)
{
struct ud_inode *ip;
int32_t error = 0;
ud_printf("udf_putpage\n");
ip = VTOI(vp);
#ifdef __lock_lint
rw_enter(&ip->i_contents, RW_WRITER);
#endif
if (vp->v_count == 0) {
cmn_err(CE_WARN, "ud_putpage : bad v_count");
error = EINVAL;
goto out;
}
if (vp->v_flag & VNOMAP) {
error = ENOSYS;
goto out;
}
if (flags & B_ASYNC) {
if (ud_delay && len &&
(flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
mutex_enter(&ip->i_tlock);
/*
* If nobody stalled, start a new cluster.
*/
if (ip->i_delaylen == 0) {
ip->i_delayoff = off;
ip->i_delaylen = len;
mutex_exit(&ip->i_tlock);
goto out;
}
/*
* If we have a full cluster or they are not contig,
* then push last cluster and start over.
*/
if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
ip->i_delayoff + ip->i_delaylen != off) {
u_offset_t doff;
size_t dlen;
doff = ip->i_delayoff;
dlen = ip->i_delaylen;
ip->i_delayoff = off;
ip->i_delaylen = len;
mutex_exit(&ip->i_tlock);
error = ud_putpages(vp, doff, dlen, flags, cr);
/* LMXXX - flags are new val, not old */
goto out;
}
/*
* There is something there, it's not full, and
* it is contig.
*/
ip->i_delaylen += len;
mutex_exit(&ip->i_tlock);
goto out;
}
/*
* Must have weird flags or we are not clustering.
*/
}
error = ud_putpages(vp, off, len, flags, cr);
out:
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#endif
return (error);
}
static int32_t
udf_map(struct vnode *vp, offset_t off,
struct as *as, caddr_t *addrp, size_t len,
uint8_t prot, uint8_t maxprot, uint32_t flags,
struct cred *cr)
{
struct segvn_crargs vn_a;
int32_t error = 0;
ud_printf("udf_map\n");
if (vp->v_flag & VNOMAP) {
error = ENOSYS;
goto end;
}
if ((off < (offset_t)0) ||
((off + len) < (offset_t)0)) {
error = EINVAL;
goto end;
}
if (vp->v_type != VREG) {
error = ENODEV;
goto end;
}
/*
* If file is being locked, disallow mapping.
*/
if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
error = EAGAIN;
goto end;
}
as_rangelock(as);
if ((flags & MAP_FIXED) == 0) {
map_addr(addrp, len, off, 1, flags);
if (*addrp == NULL) {
as_rangeunlock(as);
error = ENOMEM;
goto end;
}
} else {
/*
* User specified address - blow away any previous mappings
*/
(void) as_unmap(as, *addrp, len);
}
vn_a.vp = vp;
vn_a.offset = off;
vn_a.type = flags & MAP_TYPE;
vn_a.prot = prot;
vn_a.maxprot = maxprot;
vn_a.cred = cr;
vn_a.amp = NULL;
vn_a.flags = flags & ~MAP_TYPE;
vn_a.szc = 0;
vn_a.lgrp_mem_policy_flags = 0;
error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
as_rangeunlock(as);
end:
return (error);
}
/* ARGSUSED */
static int32_t
udf_addmap(struct vnode *vp, offset_t off,
struct as *as, caddr_t addr, size_t len,
uint8_t prot, uint8_t maxprot, uint32_t flags,
struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_addmap\n");
if (vp->v_flag & VNOMAP) {
return (ENOSYS);
}
mutex_enter(&ip->i_tlock);
ip->i_mapcnt += btopr(len);
mutex_exit(&ip->i_tlock);
return (0);
}
/* ARGSUSED */
static int32_t
udf_delmap(struct vnode *vp, offset_t off,
struct as *as, caddr_t addr, size_t len,
uint32_t prot, uint32_t maxprot, uint32_t flags,
struct cred *cr)
{
struct ud_inode *ip = VTOI(vp);
ud_printf("udf_delmap\n");
if (vp->v_flag & VNOMAP) {
return (ENOSYS);
}
mutex_enter(&ip->i_tlock);
ip->i_mapcnt -= btopr(len); /* Count released mappings */
ASSERT(ip->i_mapcnt >= 0);
mutex_exit(&ip->i_tlock);
return (0);
}
static int32_t
udf_l_pathconf(struct vnode *vp, int32_t cmd,
ulong_t *valp, struct cred *cr)
{
int32_t error = 0;
ud_printf("udf_l_pathconf\n");
if (cmd == _PC_FILESIZEBITS) {
/*
 * udf supports 64-bit file sizes, but there are several
 * other restrictions: it only supports 32-bit block numbers,
 * and daddr32_t is only an int32_t. Taking these into
 * account, we can stay just where ufs is.
*/
*valp = 41;
} else {
error = fs_pathconf(vp, cmd, valp, cr);
}
return (error);
}
uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
#ifndef __lint
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
#endif
/*
* Assumption is that there will not be a pageio request
 * to an embedded file.
*/
/* ARGSUSED */
static int32_t
udf_pageio(struct vnode *vp, struct page *pp,
u_offset_t io_off, size_t io_len,
int32_t flags, struct cred *cr)
{
daddr_t bn;
struct buf *bp;
struct ud_inode *ip = VTOI(vp);
int32_t dolock, error = 0, contig, multi_io = 0;
size_t done_len = 0, cur_len = 0;
page_t *npp = NULL, *opp = NULL, *cpp = pp;
if (pp == NULL) {
return (EINVAL);
}
dolock = (rw_owner(&ip->i_contents) != curthread);
/*
* We need a better check. Ideally, we would use another
* vnodeops so that hlocked and forcibly unmounted file
* systems would return EIO where appropriate and w/o the
* need for these checks.
*/
if (ip->i_udf == NULL) {
return (EIO);
}
#ifdef __lock_lint
rw_enter(&ip->i_contents, RW_READER);
#else
if (dolock) {
rw_enter(&ip->i_contents, RW_READER);
}
#endif
/*
* Break the io request into chunks, one for each contiguous
* stretch of disk blocks in the target file.
*/
while (done_len < io_len) {
ASSERT(cpp);
bp = NULL;
contig = 0;
if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
&bn, &contig)) {
break;
}
if (bn == UDF_HOLE) { /* No holey swapfiles */
cmn_err(CE_WARN, "SWAP file has HOLES");
error = EINVAL;
break;
}
cur_len = MIN(io_len - done_len, contig);
/*
* Check if more than one I/O is
* required to complete the given
* I/O operation
*/
if (ip->i_udf->udf_lbsize < PAGESIZE) {
if (cur_len >= PAGESIZE) {
multi_io = 0;
cur_len &= PAGEMASK;
} else {
multi_io = 1;
cur_len = MIN(io_len - done_len, PAGESIZE);
}
}
page_list_break(&cpp, &npp, btop(cur_len));
bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
ASSERT(bp != NULL);
bp->b_edev = ip->i_dev;
bp->b_dev = cmpdev(ip->i_dev);
bp->b_blkno = bn;
bp->b_un.b_addr = (caddr_t)0;
bp->b_file = vp;
bp->b_offset = (offset_t)(io_off + done_len);
/*
* ub.ub_pageios.value.ul++;
*/
if (multi_io == 0) {
(void) bdev_strategy(bp);
} else {
error = ud_multi_strat(ip, cpp, bp,
(u_offset_t)(io_off + done_len));
if (error != 0) {
pageio_done(bp);
break;
}
}
if (flags & B_READ) {
ud_pageio_reads++;
} else {
ud_pageio_writes++;
}
/*
* If the request is not B_ASYNC, wait for i/o to complete
* and re-assemble the page list to return to the caller.
* If it is B_ASYNC we leave the page list in pieces and
* cleanup() will dispose of them.
*/
if ((flags & B_ASYNC) == 0) {
error = biowait(bp);
pageio_done(bp);
if (error) {
break;
}
page_list_concat(&opp, &cpp);
}
cpp = npp;
npp = NULL;
done_len += cur_len;
}
ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
if (error) {
if (flags & B_ASYNC) {
/* Cleanup unprocessed parts of list */
page_list_concat(&cpp, &npp);
if (flags & B_READ) {
pvn_read_done(cpp, B_ERROR);
} else {
pvn_write_done(cpp, B_ERROR);
}
} else {
/* Re-assemble list and let caller clean up */
page_list_concat(&opp, &cpp);
page_list_concat(&opp, &npp);
}
}
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (dolock) {
rw_exit(&ip->i_contents);
}
#endif
return (error);
}
/* -------------------- local functions --------------------------- */
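/*
 * Kernel-internal synchronous read/write helper: wraps base/len in a
 * uio/iovec pair and hands it to ud_wrip() or ud_rdip().
 */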
int32_t
ud_rdwri(enum uio_rw rw, int32_t ioflag,
struct ud_inode *ip, caddr_t base, int32_t len,
offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
{
int32_t error;
struct uio auio;
struct iovec aiov;
ud_printf("ud_rdwri\n");
bzero((caddr_t)&auio, sizeof (uio_t));
bzero((caddr_t)&aiov, sizeof (iovec_t));
aiov.iov_base = base;
aiov.iov_len = len;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_loffset = offset;
auio.uio_segflg = (int16_t)seg;
auio.uio_resid = len;
if (rw == UIO_WRITE) {
auio.uio_fmode = FWRITE;
auio.uio_extflg = UIO_COPY_DEFAULT;
auio.uio_llimit = curproc->p_fsz_ctl;
error = ud_wrip(ip, &auio, ioflag, cr);
} else {
auio.uio_fmode = FREAD;
auio.uio_extflg = UIO_COPY_CACHED;
auio.uio_llimit = MAXOFFSET_T;
error = ud_rdip(ip, &auio, ioflag, cr);
}
if (aresid) {
*aresid = auio.uio_resid;
} else if (auio.uio_resid) {
error = EIO;
}
return (error);
}
/*
* Free behind hacks. The pager is busted.
* XXX - need to pass the information down to writedone() in a flag like B_SEQ
* or B_FREE_IF_TIGHT_ON_MEMORY.
*/
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;
/* ARGSUSED */
int32_t
ud_getpage_miss(struct vnode *vp, u_offset_t off,
size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
size_t plsz, enum seg_rw rw, int32_t seq)
{
struct ud_inode *ip = VTOI(vp);
int32_t err = 0;
size_t io_len;
u_offset_t io_off;
u_offset_t pgoff;
page_t *pp;
pl[0] = NULL;
/*
* Figure out whether the page can be created, or must be
* read from the disk
*/
if (rw == S_CREATE) {
if ((pp = page_create_va(vp, off,
PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
cmn_err(CE_WARN, "ud_getpage_miss: page_create");
return (EINVAL);
}
io_len = PAGESIZE;
} else {
pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
&io_len, off, PAGESIZE, 0);
/*
* Some other thread has entered the page.
* ud_getpage will retry page_lookup.
*/
if (pp == NULL) {
return (0);
}
/*
* Fill the page with as much data as we can from the file.
*/
err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
if (err) {
pvn_read_done(pp, B_ERROR);
return (err);
}
/*
* XXX ??? ufs has io_len instead of pgoff below
*/
ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
/*
* If the file access is sequential, initiate read ahead
* of the next cluster.
*/
if (seq && ip->i_nextrio < ip->i_size) {
ud_getpage_ra(vp, off, seg, addr);
}
}
outmiss:
pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
return (err);
}
/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp,
u_offset_t off, struct seg *seg, caddr_t addr)
{
page_t *pp;
size_t io_len;
struct ud_inode *ip = VTOI(vp);
u_offset_t io_off = ip->i_nextrio, pgoff;
caddr_t addr2 = addr + (io_off - off);
daddr_t bn;
int32_t contig = 0;
/*
* Is this test needed?
*/
if (addr2 >= seg->s_base + seg->s_size) {
return;
}
contig = 0;
if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
return;
}
pp = pvn_read_kluster(vp, io_off, seg, addr2,
&io_off, &io_len, io_off, PAGESIZE, 1);
/*
* Some other thread has entered the page.
 * So no read ahead is done here (i.e. we will have to wait
 * for the read when it is needed).
*/
if (pp == NULL) {
return;
}
(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
}
int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
uint32_t bflgs, u_offset_t *pg_off)
{
daddr_t bn;
struct buf *bp;
caddr_t kaddr, caddr;
int32_t error = 0, contig = 0, multi_io = 0;
int32_t lbsize = ip->i_udf->udf_lbsize;
int32_t lbmask = ip->i_udf->udf_lbmask;
uint64_t isize;
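	/* isize is i_size rounded up to a logical-block boundary. */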
isize = (ip->i_size + lbmask) & (~lbmask);
if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
/*
 * Embedded file: read the file_entry
 * from the buffer cache and copy the required
 * portions.
*/
bp = ud_bread(ip->i_dev,
ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
if ((bp->b_error == 0) &&
(bp->b_resid == 0)) {
caddr = bp->b_un.b_addr + ip->i_data_off;
/*
* mapin to kvm
*/
kaddr = (caddr_t)ppmapin(pp,
PROT_READ | PROT_WRITE, (caddr_t)-1);
(void) kcopy(caddr, kaddr, ip->i_size);
/*
* mapout of kvm
*/
ppmapout(kaddr);
}
brelse(bp);
contig = ip->i_size;
} else {
/*
* Get the continuous size and block number
* at offset "off"
*/
if (error = ud_bmap_read(ip, off, &bn, &contig))
goto out;
contig = MIN(contig, PAGESIZE);
contig = (contig + lbmask) & (~lbmask);
/*
* Zero part of the page which we are not
* going to read from the disk.
*/
if (bn == UDF_HOLE) {
/*
* This is a HOLE. Just zero out
* the page
*/
if (((off + contig) == isize) ||
(contig == PAGESIZE)) {
pagezero(pp->p_prev, 0, PAGESIZE);
goto out;
}
}
if (contig < PAGESIZE) {
uint64_t count;
count = isize - off;
if (contig != count) {
multi_io = 1;
contig = (int32_t)(MIN(count, PAGESIZE));
} else {
pagezero(pp->p_prev, contig, PAGESIZE - contig);
}
}
/*
* Get a bp and initialize it
*/
bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
ASSERT(bp != NULL);
bp->b_edev = ip->i_dev;
bp->b_dev = cmpdev(ip->i_dev);
bp->b_blkno = bn;
bp->b_un.b_addr = 0;
bp->b_file = ip->i_vnode;
/*
* Start I/O
*/
if (multi_io == 0) {
/*
* Single I/O is sufficient for this page
*/
(void) bdev_strategy(bp);
} else {
/*
* We need to do the I/O in
 * pieces.
*/
error = ud_multi_strat(ip, pp, bp, off);
if (error != 0) {
goto out;
}
}
if ((bflgs & B_ASYNC) == 0) {
/*
* Wait for i/o to complete.
*/
error = biowait(bp);
pageio_done(bp);
if (error) {
goto out;
}
}
}
if ((off + contig) >= ip->i_size) {
contig = ip->i_size - off;
}
out:
*pg_off = contig;
return (error);
}
int32_t
ud_putpages(struct vnode *vp, offset_t off,
size_t len, int32_t flags, struct cred *cr)
{
struct ud_inode *ip;
page_t *pp;
u_offset_t io_off;
size_t io_len;
u_offset_t eoff;
int32_t err = 0;
int32_t dolock;
ud_printf("ud_putpages\n");
if (vp->v_count == 0) {
cmn_err(CE_WARN, "ud_putpages: bad v_count");
return (EINVAL);
}
ip = VTOI(vp);
/*
 * Acquire the reader/writer inode lock before locking
* any pages in this inode.
* The inode lock is held during i/o.
*/
if (len == 0) {
mutex_enter(&ip->i_tlock);
ip->i_delayoff = ip->i_delaylen = 0;
mutex_exit(&ip->i_tlock);
}
#ifdef __lock_lint
rw_enter(&ip->i_contents, RW_READER);
#else
dolock = (rw_owner(&ip->i_contents) != curthread);
if (dolock) {
rw_enter(&ip->i_contents, RW_READER);
}
#endif
if (!vn_has_cached_data(vp)) {
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (dolock) {
rw_exit(&ip->i_contents);
}
#endif
return (0);
}
if (len == 0) {
/*
* Search the entire vp list for pages >= off.
*/
err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
flags, cr);
} else {
/*
* Loop over all offsets in the range looking for
* pages to deal with.
*/
if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
eoff = MIN(off + len, eoff);
} else {
eoff = off + len;
}
for (io_off = off; io_off < eoff; io_off += io_len) {
/*
* If we are not invalidating, synchronously
* freeing or writing pages, use the routine
* page_lookup_nowait() to prevent reclaiming
* them from the free list.
*/
if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
pp = page_lookup(vp, io_off,
(flags & (B_INVAL | B_FREE)) ?
SE_EXCL : SE_SHARED);
} else {
pp = page_lookup_nowait(vp, io_off,
(flags & B_FREE) ? SE_EXCL : SE_SHARED);
}
if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
io_len = PAGESIZE;
} else {
err = ud_putapage(vp, pp,
&io_off, &io_len, flags, cr);
if (err != 0) {
break;
}
/*
* "io_off" and "io_len" are returned as
* the range of pages we actually wrote.
* This allows us to skip ahead more quickly
* since several pages may've been dealt
* with by this iteration of the loop.
*/
}
}
}
if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
/*
* We have just sync'ed back all the pages on
* the inode, turn off the IMODTIME flag.
*/
mutex_enter(&ip->i_tlock);
ip->i_flag &= ~IMODTIME;
mutex_exit(&ip->i_tlock);
}
#ifdef __lock_lint
rw_exit(&ip->i_contents);
#else
if (dolock) {
rw_exit(&ip->i_contents);
}
#endif
return (err);
}
/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp,
page_t *pp, u_offset_t *offp,
size_t *lenp, int32_t flags, struct cred *cr)
{
daddr_t bn;
size_t io_len;
struct ud_inode *ip;
int32_t error = 0, contig, multi_io = 0;
struct udf_vfs *udf_vfsp;
u_offset_t off, io_off;
caddr_t kaddr, caddr;
struct buf *bp = NULL;
int32_t lbmask;
uint64_t isize;
int32_t crc_len;
struct file_entry *fe;
ud_printf("ud_putapage\n");
ip = VTOI(vp);
ASSERT(ip);
ASSERT(RW_LOCK_HELD(&ip->i_contents));
lbmask = ip->i_udf->udf_lbmask;
isize = (ip->i_size + lbmask) & (~lbmask);
udf_vfsp = ip->i_udf;
ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
/*
* If the modified time on the inode has not already been
* set elsewhere (e.g. for write/setattr) we set the time now.
* This gives us approximate modified times for mmap'ed files
* which are modified via stores in the user address space.
*/
if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
mutex_enter(&ip->i_tlock);
ip->i_flag |= IUPD;
ITIMES_NOLOCK(ip);
mutex_exit(&ip->i_tlock);
}
/*
 * Align the request to a block boundary (for old file systems),
* and go ask bmap() how contiguous things are for this file.
*/
off = pp->p_offset & ~(offset_t)lbmask;
/* block align it */
if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
ASSERT(ip->i_size <= ip->i_max_emb);
pp = pvn_write_kluster(vp, pp, &io_off,
&io_len, off, PAGESIZE, flags);
if (io_len == 0) {
io_len = PAGESIZE;
}
bp = ud_bread(ip->i_dev,
ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
udf_vfsp->udf_lbsize);
fe = (struct file_entry *)bp->b_un.b_addr;
if ((bp->b_flags & B_ERROR) ||
(ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
ip->i_icb_block,
1, udf_vfsp->udf_lbsize) != 0)) {
if (pp != NULL)
pvn_write_done(pp, B_ERROR | B_WRITE | flags);
if (bp->b_flags & B_ERROR) {
error = EIO;
} else {
error = EINVAL;
}
brelse(bp);
return (error);
}
if ((bp->b_error == 0) &&
(bp->b_resid == 0)) {
caddr = bp->b_un.b_addr + ip->i_data_off;
kaddr = (caddr_t)ppmapin(pp,
PROT_READ | PROT_WRITE, (caddr_t)-1);
(void) kcopy(kaddr, caddr, ip->i_size);
ppmapout(kaddr);
}
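		/*
		 * The descriptor CRC covers everything up to fe_spec (the
		 * null-pointer cast computes that offset), plus the
		 * extended attributes and the embedded file data.
		 */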
crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
SWAP_32(fe->fe_len_ear);
crc_len += ip->i_size;
ud_make_tag(ip->i_udf, &fe->fe_tag,
UD_FILE_ENTRY, ip->i_icb_block, crc_len);
bwrite(bp);
if (flags & B_ASYNC) {
pvn_write_done(pp, flags);
}
contig = ip->i_size;
} else {
if (error = ud_bmap_read(ip, off, &bn, &contig)) {
goto out;
}
contig = MIN(contig, PAGESIZE);
contig = (contig + lbmask) & (~lbmask);
if (contig < PAGESIZE) {
uint64_t count;
count = isize - off;
if (contig != count) {
multi_io = 1;
contig = (int32_t)(MIN(count, PAGESIZE));
}
}
if ((off + contig) > isize) {
contig = isize - off;
}
if (contig > PAGESIZE) {
if (contig & PAGEOFFSET) {
contig &= PAGEMASK;
}
}
pp = pvn_write_kluster(vp, pp, &io_off,
&io_len, off, contig, flags);
if (io_len == 0) {
io_len = PAGESIZE;
}
bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
ASSERT(bp != NULL);
bp->b_edev = ip->i_dev;
bp->b_dev = cmpdev(ip->i_dev);
bp->b_blkno = bn;
bp->b_un.b_addr = 0;
bp->b_file = vp;
bp->b_offset = (offset_t)off;
/*
 * Write throttle: account for the bytes in flight so
 * ud_iodone() can wake up anyone waiting on i_wrcv when
 * the outstanding-write count drains below ud_LW.
 */
ASSERT(bp->b_iodone == NULL);
bp->b_iodone = ud_iodone;
mutex_enter(&ip->i_tlock);
ip->i_writes += bp->b_bcount;
mutex_exit(&ip->i_tlock);
if (multi_io == 0) {
(void) bdev_strategy(bp);
} else {
error = ud_multi_strat(ip, pp, bp, off);
if (error != 0) {
goto out;
}
}
if ((flags & B_ASYNC) == 0) {
/*
* Wait for i/o to complete.
*/
error = biowait(bp);
pageio_done(bp);
}
}
if ((flags & B_ASYNC) == 0) {
pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
}
pp = NULL;
out:
if (error != 0 && pp != NULL) {
pvn_write_done(pp, B_ERROR | B_WRITE | flags);
}
if (offp) {
*offp = io_off;
}
if (lenp) {
*lenp = io_len;
}
return (error);
}
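/*
 * I/O completion routine for writes issued above.  Deducts the
 * buffer's byte count from the inode's outstanding-write count
 * and wakes up any thread waiting on i_wrcv once the count
 * drains to the low-water mark (ud_LW).
 */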
int32_t
ud_iodone(struct buf *bp)
{
struct ud_inode *ip;
ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
bp->b_iodone = NULL;
ip = VTOI(bp->b_pages->p_vnode);
mutex_enter(&ip->i_tlock);
if (ip->i_writes >= ud_LW) {
if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
if (ud_WRITES) {
cv_broadcast(&ip->i_wrcv); /* wake all up */
}
}
} else {
ip->i_writes -= bp->b_bcount;
}
mutex_exit(&ip->i_tlock);
iodone(bp);
return (0);
}
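/*
 * Read the data for inode ip into the areas described by uio.
 * Called with i_contents held; the lock is dropped (when held
 * as a reader) around the segmap copy and re-acquired, and the
 * inode is flushed at the end when FRSYNC semantics require it.
 */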
/* ARGSUSED3 */
int32_t
ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
{
struct vnode *vp;
struct udf_vfs *udf_vfsp;
krw_t rwtype;
caddr_t base;
uint32_t flags;
int32_t error, n, on, mapon, dofree;
u_offset_t off;
long oresid = uio->uio_resid;
ASSERT(RW_LOCK_HELD(&ip->i_contents));
if ((ip->i_type != VREG) &&
(ip->i_type != VDIR) &&
(ip->i_type != VLNK)) {
return (EIO);
}
if (uio->uio_loffset > MAXOFFSET_T) {
return (0);
}
if ((uio->uio_loffset < (offset_t)0) ||
((uio->uio_loffset + uio->uio_resid) < 0)) {
return (EINVAL);
}
if (uio->uio_resid == 0) {
return (0);
}
vp = ITOV(ip);
udf_vfsp = ip->i_udf;
mutex_enter(&ip->i_tlock);
ip->i_flag |= IACC;
mutex_exit(&ip->i_tlock);
rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);
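/*
 * Copy out the data one MAXBSIZE window at a time: map the
 * window with segmap, uiomove() out of it, and release it,
 * honoring the free-behind and FRSYNC flags.
 */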
do {
offset_t diff;
u_offset_t uoff = uio->uio_loffset;
off = uoff & (offset_t)MAXBMASK;
mapon = (int)(uoff & (offset_t)MAXBOFFSET);
on = (int)blkoff(udf_vfsp, uoff);
n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
diff = ip->i_size - uoff;
if (diff <= (offset_t)0) {
error = 0;
goto out;
}
if (diff < (offset_t)n) {
n = (int)diff;
}
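/*
 * Free-behind heuristic: if reads are sequential (i_nextr
 * matches this window) and we are past ud_smallfile, the
 * pages behind us are unlikely to be needed again.
 */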
dofree = ud_freebehind &&
ip->i_nextr == (off & PAGEMASK) &&
off > ud_smallfile;
#ifndef __lock_lint
if (rwtype == RW_READER) {
rw_exit(&ip->i_contents);
}
#endif
base = segmap_getmapflt(segkmap, vp, (off + mapon),
(uint32_t)n, 1, S_READ);
error = uiomove(base + mapon, (long)n, UIO_READ, uio);
flags = 0;
if (!error) {
/*
 * If we read a whole block, or read to EOF, we
 * won't need this buffer again soon.
 */
if (n + on == MAXBSIZE && ud_freebehind && dofree &&
freemem < lotsfree + pages_before_pager) {
flags = SM_FREE | SM_DONTNEED | SM_ASYNC;
}
/*
 * For POSIX synchronous reads (FRSYNC together with
 * FSYNC or FDSYNC), make sure the page just read is
 * written to disk if it is dirty, along with its
 * corresponding indirect blocks.
 */
if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
flags &= ~SM_ASYNC;
flags |= SM_WRITE;
}
error = segmap_release(segkmap, base, flags);
} else {
(void) segmap_release(segkmap, base, flags);
}
#ifndef __lock_lint
if (rwtype == RW_READER) {
rw_enter(&ip->i_contents, rwtype);
}
#endif
} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
/*
* Inode is updated according to this table if FRSYNC is set.
*
* FSYNC FDSYNC(posix.4)
* --------------------------
* always IATTCHG|IBDWRITE
*/
if (ioflag & FRSYNC) {
if ((ioflag & FSYNC) ||
((ioflag & FDSYNC) && (ip->i_flag & (IATTCHG|IBDWRITE)))) {
rw_exit(&ip->i_contents);
rw_enter(&ip->i_contents, RW_WRITER);
ud_iupdat(ip, 1);
}
}
/*
* If we've already done a partial read, terminate
* the read but return no error.
*/
if (oresid != uio->uio_resid) {
error = 0;
}
ITIMES(ip);
return (error);
}
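/*
 * Write the data described by uio to inode ip.  Called with
 * i_contents held for writing; the lock is dropped around the
 * segmap copy and re-acquired.  Handles block allocation for
 * file extension, zero-fill of partially created pages, and the
 * synchronous-write and setuid/setgid-clearing rules below.
 */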
int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
caddr_t base;
struct vnode *vp;
struct udf_vfs *udf_vfsp;
uint32_t flags;
int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
int32_t pagecreate, newpage;
uint64_t old_i_size;
u_offset_t off;
long start_resid = uio->uio_resid, premove_resid;
rlim64_t limit = uio->uio_limit;
ASSERT(RW_WRITE_HELD(&ip->i_contents));
if ((ip->i_type != VREG) &&
(ip->i_type != VDIR) &&
(ip->i_type != VLNK)) {
return (EIO);
}
if (uio->uio_loffset >= MAXOFFSET_T) {
return (EFBIG);
}
/*
 * UDF limits file sizes to 2^40 - 1 bytes;
 * see udf_l_pathconf().
 */
if (limit > (((uint64_t)1 << 40) - 1)) {
limit = ((uint64_t)1 << 40) - 1;
}
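/*
 * Writing at or beyond the file size limit: post the
 * RLIMIT_FSIZE resource-control action (historically
 * SIGXFSZ) and fail with EFBIG.
 */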
if (uio->uio_loffset >= limit) {
proc_t *p = ttoproc(curthread);
mutex_enter(&p->p_lock);
(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
p, RCA_UNSAFE_SIGINFO);
mutex_exit(&p->p_lock);
return (EFBIG);
}
if ((uio->uio_loffset < (offset_t)0) ||
((uio->uio_loffset + uio->uio_resid) < 0)) {
return (EINVAL);
}
if (uio->uio_resid == 0) {
return (0);
}
mutex_enter(&ip->i_tlock);
ip->i_flag |= INOACC;
if (ioflag & (FSYNC | FDSYNC)) {
ip->i_flag |= ISYNC;
iupdat_flag = 1;
}
mutex_exit(&ip->i_tlock);
udf_vfsp = ip->i_udf;
vp = ITOV(ip);
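/*
 * Write one MAXBSIZE segmap window per iteration.  When a
 * whole window (or the tail of a growing file) is being
 * written, the pages can be created directly instead of
 * being faulted in from disk.
 */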
do {
u_offset_t uoff = uio->uio_loffset;
off = uoff & (offset_t)MAXBMASK;
mapon = (int)(uoff & (offset_t)MAXBOFFSET);
on = (int)blkoff(udf_vfsp, uoff);
n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
if (ip->i_type == VREG && uoff + n >= limit) {
if (uoff >= limit) {
error = EFBIG;
goto out;
}
n = (int)(limit - (rlim64_t)uoff);
}
if (uoff + n > ip->i_size) {
/*
* We are extending the length of the file.
* bmap is used so that we are sure that
* if we need to allocate new blocks, that it
* is done here before we up the file size.
*/
error = ud_bmap_write(ip, uoff,
(int)(on + n), mapon == 0, cr);
if (error) {
break;
}
i_size_changed = 1;
old_i_size = ip->i_size;
ip->i_size = uoff + n;
/*
* If we are writing from the beginning of
* the mapping, we can just create the
* pages without having to read them.
*/
pagecreate = (mapon == 0);
} else if (n == MAXBSIZE) {
/*
* Going to do a whole mapping's worth,
* so we can just create the pages w/o
* having to read them in. But before
* we do that, we need to make sure any
* needed blocks are allocated first.
*/
error = ud_bmap_write(ip, uoff,
(int)(on + n), 1, cr);
if (error) {
break;
}
pagecreate = 1;
} else {
pagecreate = 0;
}
rw_exit(&ip->i_contents);
base = segmap_getmapflt(segkmap, vp, (off + mapon),
(uint32_t)n, !pagecreate, S_WRITE);
/*
* segmap_pagecreate() returns 1 if it calls
* page_create_va() to allocate any pages.
*/
newpage = 0;
if (pagecreate) {
newpage = segmap_pagecreate(segkmap, base,
(size_t)n, 0);
}
premove_resid = uio->uio_resid;
error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
if (pagecreate &&
uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
/*
* We created pages w/o initializing them completely,
* thus we need to zero the part that wasn't set up.
* This happens on most EOF write cases and if
* we had some sort of error during the uiomove.
*/
int nzero, nmoved;
nmoved = (int)(uio->uio_loffset - (off + mapon));
ASSERT(nmoved >= 0 && nmoved <= n);
nzero = roundup(on + n, PAGESIZE) - nmoved;
ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
}
/*
* Unlock the pages allocated by page_create_va()
* in segmap_pagecreate()
*/
if (newpage) {
segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
}
if (error) {
/*
* If we failed on a write, we may have already
* allocated file blocks as well as pages. It's
* hard to undo the block allocation, but we must
* be sure to invalidate any pages that may have
* been allocated.
*/
(void) segmap_release(segkmap, base, SM_INVAL);
} else {
flags = 0;
/*
* Force write back for synchronous write cases.
*/
if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
/*
* If the sticky bit is set but the
* execute bit is not set, we do a
* synchronous write back and free
* the page when done. We set up swap
* files to be handled this way to
* prevent servers from keeping around
* the client's swap pages too long.
* XXX - there ought to be a better way.
*/
if (IS_SWAPVP(vp)) {
flags = SM_WRITE | SM_FREE |
SM_DONTNEED;
iupdat_flag = 0;
} else {
flags = SM_WRITE;
}
} else if (((mapon + n) == MAXBSIZE) ||
IS_SWAPVP(vp)) {
/*
* Have written a whole block.
* Start an asynchronous write and
* mark the buffer to indicate that
* it won't be needed again soon.
*/
flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
}
error = segmap_release(segkmap, base, flags);
/*
* If the operation failed and is synchronous,
* then we need to unwind what uiomove() last
* did so we can potentially return an error to
* the caller. If this write operation was
* done in two pieces and the first succeeded,
* then we won't return an error for the second
* piece that failed. However, we only want to
* return a resid value that reflects what was
* really done.
*
* Failures for non-synchronous operations can
* be ignored since the page subsystem will
* retry the operation until it succeeds or the
* file system is unmounted.
*/
if (error) {
if ((ioflag & (FSYNC | FDSYNC)) ||
ip->i_type == VDIR) {
uio->uio_resid = premove_resid;
} else {
error = 0;
}
}
}
/*
* Re-acquire contents lock.
*/
rw_enter(&ip->i_contents, RW_WRITER);
/*
* If the uiomove() failed or if a synchronous
* page push failed, fix up i_size.
*/
if (error) {
if (i_size_changed) {
/*
* The uiomove failed, and we
* allocated blocks, so get rid
* of them.
*/
(void) ud_itrunc(ip, old_i_size, 0, cr);
}
} else {
/*
* XXX - Can this be out of the loop?
*/
ip->i_flag |= IUPD | ICHG;
if (i_size_changed) {
ip->i_flag |= IATTCHG;
}
if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
(IEXEC >> 10))) != 0 &&
(ip->i_char & (ISUID | ISGID)) != 0 &&
secpolicy_vnode_setid_retain(cr,
(ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
/*
* Clear Set-UID & Set-GID bits on
* successful write if not privileged
* and at least one of the execute bits
* is set. If we always clear Set-GID,
* mandatory file and record locking is
* unusable.
*/
ip->i_char &= ~(ISUID | ISGID);
}
}
} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
/*
* Inode is updated according to this table -
*
* FSYNC FDSYNC(posix.4)
* --------------------------
* always@ IATTCHG|IBDWRITE
*
* @ - If we are doing a synchronous write, the only time we
* should not be sync'ing the ip here is if we have the
* stickyhack activated, the file is marked with the sticky
* bit and no exec bit, the file length has not changed, and
* no new blocks have been allocated during this write.
*/
if ((ip->i_flag & ISYNC) != 0) {
/*
* we have eliminated nosync
*/
if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
((ioflag & FSYNC) && iupdat_flag)) {
ud_iupdat(ip, 1);
}
}
/*
* If we've already done a partial-write, terminate
* the write but return no error.
*/
if (start_resid != uio->uio_resid) {
error = 0;
}
ip->i_flag &= ~(INOACC | ISYNC);
ITIMES_NOLOCK(ip);
return (error);
}
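/*
 * A single page may map to several non-contiguous extents on
 * disk.  Split the original buffer into one cloned "slave"
 * buffer per extent, zero-fill any holes on reads, and issue
 * the pieces; ud_slave_done() reassembles the result into the
 * master buffer.
 */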
int32_t
ud_multi_strat(struct ud_inode *ip,
page_t *pp, struct buf *bp, u_offset_t start)
{
daddr_t bn;
int32_t error = 0, io_count, contig, alloc_sz, i;
uint32_t io_off;
mio_master_t *mm = NULL;
mio_slave_t *ms = NULL;
struct buf *rbp;
ASSERT(!(start & PAGEOFFSET));
/*
* Figure out how many buffers to allocate
*/
io_count = 0;
for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
contig = 0;
if ((error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
&bn, &contig)) != 0) {
goto end;
}
if (contig == 0) {
goto end;
}
contig = MIN(contig, PAGESIZE - io_off);
if (bn != UDF_HOLE) {
io_count++;
} else {
/*
* HOLE
*/
if (bp->b_flags & B_READ) {
/*
 * This is a hole; if it is being read,
 * fill it with zeros.
 */
pagezero(pp, io_off, contig);
}
}
}
if (io_count != 0) {
/*
* Allocate memory for all the
* required number of buffers
*/
alloc_sz = sizeof (mio_master_t) +
(sizeof (mio_slave_t) * io_count);
/*
 * KM_SLEEP allocations block until memory is
 * available, so this cannot fail.
 */
mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
/*
* initialize master
*/
mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
mm->mm_size = alloc_sz;
mm->mm_bp = bp;
mm->mm_resid = 0;
mm->mm_error = 0;
mm->mm_index = master_index++;
ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
/*
* Initialize buffers
*/
io_count = 0;
for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
contig = 0;
if ((error = ud_bmap_read(ip,
(u_offset_t)(start + io_off),
&bn, &contig)) != 0) {
goto end;
}
ASSERT(contig);
if ((io_off + contig) > bp->b_bcount) {
contig = bp->b_bcount - io_off;
}
if (bn != UDF_HOLE) {
/*
* Clone the buffer
* and prepare to start I/O
*/
ms->ms_ptr = mm;
bioinit(&ms->ms_buf);
rbp = bioclone(bp, io_off, (size_t)contig,
bp->b_edev, bn, ud_slave_done,
&ms->ms_buf, KM_NOSLEEP);
ASSERT(rbp == &ms->ms_buf);
mm->mm_resid += contig;
io_count++;
ms++;
}
}
/*
* Start I/O's
*/
ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
for (i = 0; i < io_count; i++) {
(void) bdev_strategy(&ms->ms_buf);
ms++;
}
}
end:
if (error != 0) {
bp->b_flags |= B_ERROR;
bp->b_error = error;
if (mm != NULL) {
mutex_destroy(&mm->mm_mutex);
kmem_free(mm, mm->mm_size);
}
}
return (error);
}
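/*
 * Completion routine for the cloned buffers issued by
 * ud_multi_strat().  The last slave to finish propagates any
 * error to the master buffer, calls biodone() on it, and frees
 * the master/slave allocation.
 */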
int32_t
ud_slave_done(struct buf *bp)
{
mio_master_t *mm;
int32_t resid;
ASSERT(SEMA_HELD(&bp->b_sem));
ASSERT((bp->b_flags & B_DONE) == 0);
mm = ((mio_slave_t *)bp)->ms_ptr;
/*
* Propagate error and byte count info from slave struct to
* the master struct
*/
mutex_enter(&mm->mm_mutex);
if (bp->b_flags & B_ERROR) {
/*
 * If multiple slave buffers fail, we keep only the
 * most recent error; this is fine because we cannot
 * return more than one error anyway.
 */
mm->mm_error = bp->b_error;
}
mm->mm_resid -= bp->b_bcount;
resid = mm->mm_resid;
mutex_exit(&mm->mm_mutex);
/*
* free up the resources allocated to cloned buffers.
*/
bp_mapout(bp);
biofini(bp);
if (resid == 0) {
/*
* This was the last outstanding I/O;
* clean up and complete the original buffer.
*/
if (mm->mm_error) {
mm->mm_bp->b_flags |= B_ERROR;
mm->mm_bp->b_error = mm->mm_error;
}
biodone(mm->mm_bp);
mutex_destroy(&mm->mm_mutex);
kmem_free(mm, mm->mm_size);
}
return (0);
}