nfs_srv.c revision 5cb0d67909d9970a3e7adbea9422ca3fc88000bf
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
*/
/*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
*/
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>
#include <nfs/nfs.h>
#include <nfs/export.h>
#include <nfs/nfs_cmd.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/strsubr.h>
/*
* These are the interface routines for the server side of the
* Network File System. See the NFS version 2 protocol specification
* for a description of this interface.
*/
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
cred_t *);
/*
* Some "over the wire" UNIX file types. These are encoded
* into the mode. This needs to be fixed in the next rev.
*/
#define IFMT 0170000 /* type of file */
#define IFCHR 0020000 /* character special */
#define IFBLK 0060000 /* block special */
#define IFSOCK 0140000 /* socket */
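/*
 * Caller id for the caller_context_t passed to VOP routines;
 * obtained from fs_new_caller_id() in rfs_srvrinit().
 */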
u_longlong_t nfs2_srv_caller_id;
/*
* Get file attributes.
* Returns the current attributes of the file with the given fhandle.
*/
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *vp;
struct vattr va;
vp = nfs_fhtovp(fhp, exi);
if (vp == NULL) {
ns->ns_status = NFSERR_STALE;
return;
}
/*
* Do the getattr.
*/
va.va_mask = AT_ALL; /* we want all the attributes */
error = rfs4_delegated_getattr(vp, &va, 0, cr);
/* check for overflows */
if (!error) {
/* Lie about the object type for a referral */
if (vn_is_nfs_reparse(vp, cr))
va.va_type = VLNK;
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &ns->ns_attr);
}
VN_RELE(vp);
ns->ns_status = puterrno(error);
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
return (fhp);
}
/*
* Set file attributes.
* Sets the attributes of the file with the given fhandle. Returns
* the new attributes.
*/
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
int flag;
int in_crit = 0;
vnode_t *vp;
struct vattr va;
struct vattr bva;
struct flock64 bf;
caller_context_t ct;
vp = nfs_fhtovp(&args->saa_fh, exi);
if (vp == NULL) {
ns->ns_status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
ns->ns_status = NFSERR_ROFS;
return;
}
error = sattr_to_vattr(&args->saa_sa, &va);
if (error) {
VN_RELE(vp);
ns->ns_status = puterrno(error);
return;
}
/*
* If the client is requesting a change to the mtime,
* but the nanosecond field is set to 1 billion, then
* this is a flag to the server that it should set the
* atime and mtime fields to the server's current time.
* The 1 billion number actually came from the client
* as 1 million, but the units in the over the wire
* request are microseconds instead of nanoseconds.
*
* This is an overload of the protocol and should be
* documented in the NFS Version 2 protocol specification.
*/
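/*
 * (sattr_to_vattr() converts the client's microseconds to
 * nanoseconds by multiplying by 1000, so the on-the-wire value
 * of 1000000 shows up here as tv_nsec == 1000000000.)
 */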
if (va.va_mask & AT_MTIME) {
if (va.va_mtime.tv_nsec == 1000000000) {
gethrestime(&va.va_mtime);
va.va_atime = va.va_mtime;
va.va_mask |= AT_ATIME;
flag = 0;
} else
flag = ATTR_UTIME;
} else
flag = 0;
/*
* If the filesystem is exported with nosuid, then mask off
* the setuid and setgid bits.
*/
if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
(exi->exi_export.ex_flags & EX_NOSUID))
va.va_mode &= ~(VSUID | VSGID);
ct.cc_sysid = 0;
ct.cc_pid = 0;
ct.cc_caller_id = nfs2_srv_caller_id;
ct.cc_flags = CC_DONTBLOCK;
/*
* We need to specially handle size changes because it is
* possible for the client to create a file with modes
* which indicate read-only, but with the file opened for
* writing. If the client then tries to set the size of
* the file, then the normal access checking done in
* VOP_SETATTR would prevent the client from doing so,
* although it should be legal for it to do so. To get
* around this, we do the access checking for ourselves
* and then use VOP_SPACE, which doesn't do the access
* checking that VOP_SETATTR does. VOP_SPACE can only
* operate on VREG files, let VOP_SETATTR handle the other
* extremely rare cases.
* Also the client should not be allowed to change the
* size of the file if there is a conflicting non-blocking
* mandatory lock in the region of change.
*/
if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
if (nbl_need_check(vp)) {
nbl_start_crit(vp, RW_READER);
in_crit = 1;
}
bva.va_mask = AT_UID | AT_SIZE;
error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
if (error) {
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
ns->ns_status = puterrno(error);
return;
}
if (in_crit) {
u_offset_t offset;
ssize_t length;
if (va.va_size < bva.va_size) {
offset = va.va_size;
length = bva.va_size - va.va_size;
} else {
offset = bva.va_size;
length = va.va_size - bva.va_size;
}
if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
NULL)) {
error = EACCES;
}
}
if (crgetuid(cr) == bva.va_uid && !error &&
va.va_size != bva.va_size) {
va.va_mask &= ~AT_SIZE;
bf.l_type = F_WRLCK;
bf.l_whence = 0;
bf.l_start = (off64_t)va.va_size;
bf.l_len = 0;
bf.l_sysid = 0;
bf.l_pid = 0;
error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
(offset_t)va.va_size, cr, &ct);
}
if (in_crit)
nbl_end_crit(vp);
} else
error = 0;
/*
* Do the setattr.
*/
if (!error && va.va_mask) {
error = VOP_SETATTR(vp, &va, flag, cr, &ct);
}
/*
* check if the monitor on either vop_space or vop_setattr detected
* a delegation conflict and if so, mark the thread flag as
* wouldblock so that the response is dropped and the client will
* try again.
*/
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
VN_RELE(vp);
curthread->t_flag |= T_WOULDBLOCK;
return;
}
if (!error) {
va.va_mask = AT_ALL; /* get everything */
error = rfs4_delegated_getattr(vp, &va, 0, cr);
/* check for overflows */
if (!error) {
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &ns->ns_attr);
}
}
ct.cc_flags = 0;
/*
* Force modified metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
VN_RELE(vp);
ns->ns_status = puterrno(error);
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
return (&args->saa_fh);
}
/*
* Directory lookup.
* Returns an fhandle and file attributes for file name in a directory.
*/
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *dvp;
vnode_t *vp;
struct vattr va;
fhandle_t *fhp = da->da_fhandle;
struct sec_ol sec = {0, 0};
bool_t publicfh_flag = FALSE, auth_weak = FALSE;
char *name;
struct sockaddr *ca;
/*
* Trusted Extensions doesn't support NFSv2. MOUNT
* will reject v2 clients. Need to prevent v2 client
* access via WebNFS here.
*/
if (is_system_labeled() && req->rq_vers == 2) {
dr->dr_status = NFSERR_ACCES;
return;
}
/*
* Disallow NULL paths
*/
if (da->da_name == NULL || *da->da_name == '\0') {
dr->dr_status = NFSERR_ACCES;
return;
}
/*
* Allow lookups from the root - the default
* location of the public filehandle.
*/
if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
dvp = rootdir;
VN_HOLD(dvp);
} else {
dvp = nfs_fhtovp(fhp, exi);
if (dvp == NULL) {
dr->dr_status = NFSERR_STALE;
return;
}
}
/*
* Do not allow lookup beyond the root.
* If the filehandle matches a filehandle of the exi,
* then the ".." refers beyond the root of an exported filesystem.
*/
if (strcmp(da->da_name, "..") == 0 &&
EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
VN_RELE(dvp);
dr->dr_status = NFSERR_NOENT;
return;
}
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
MAXPATHLEN);
if (name == NULL) {
dr->dr_status = NFSERR_ACCES;
return;
}
/*
* If the public filehandle is used then allow
* a multi-component lookup, i.e. evaluate
* a pathname and follow symbolic links if
* necessary.
*
* This may result in a vnode in another filesystem
* which is OK as long as the filesystem is exported.
*/
if (PUBLIC_FH2(fhp)) {
publicfh_flag = TRUE;
error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
&sec);
} else {
/*
* Do a normal single component lookup.
*/
error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
NULL, NULL, NULL);
}
if (name != da->da_name)
kmem_free(name, MAXPATHLEN);
if (!error) {
va.va_mask = AT_ALL; /* we want everything */
error = rfs4_delegated_getattr(vp, &va, 0, cr);
/* check for overflows */
if (!error) {
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &dr->dr_attr);
if (!error) {
if (sec.sec_flags & SEC_QUERY)
error = makefh_ol(&dr->dr_fhandle, exi,
sec.sec_index);
else {
error = makefh(&dr->dr_fhandle, vp,
exi);
if (!error && publicfh_flag &&
!chk_clnt_sec(exi, req))
auth_weak = TRUE;
}
}
}
VN_RELE(vp);
}
VN_RELE(dvp);
/*
* If publicfh_flag is true then we have called rfs_publicfh_mclookup
* and have obtained a new exportinfo in exi which needs to be
* released. Note that the original exportinfo pointed to by exi
* will be released by the caller, common_dispatch.
*/
if (publicfh_flag && exi != NULL)
exi_rele(exi);
/*
* If it's public fh, no 0x81, and client's flavor is
* invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
* Then set RPC status to AUTH_TOOWEAK in common_dispatch.
*/
if (auth_weak)
dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
else
dr->dr_status = puterrno(error);
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
return (da->da_fhandle);
}
/*
* Read symbolic link.
* Returns the string in the symbolic link at the given fhandle.
*/
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
struct iovec iov;
struct uio uio;
vnode_t *vp;
struct vattr va;
struct sockaddr *ca;
char *name = NULL;
int is_referral = 0;
vp = nfs_fhtovp(fhp, exi);
if (vp == NULL) {
rl->rl_data = NULL;
rl->rl_status = NFSERR_STALE;
return;
}
va.va_mask = AT_MODE;
error = VOP_GETATTR(vp, &va, 0, cr, NULL);
if (error) {
VN_RELE(vp);
rl->rl_data = NULL;
rl->rl_status = puterrno(error);
return;
}
if (MANDLOCK(vp, va.va_mode)) {
VN_RELE(vp);
rl->rl_data = NULL;
rl->rl_status = NFSERR_ACCES;
return;
}
/* We lied about the object type for a referral */
if (vn_is_nfs_reparse(vp, cr))
is_referral = 1;
/*
* XNFS and RFC1094 require us to return ENXIO if argument
* is not a link. BUGID 1138002.
*/
if (vp->v_type != VLNK && !is_referral) {
VN_RELE(vp);
rl->rl_data = NULL;
rl->rl_status = NFSERR_NXIO;
return;
}
/*
* Allocate data for pathname. This will be freed by rfs_rlfree.
*/
rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
if (is_referral) {
char *s;
size_t strsz;
/* Get an artificial symlink based on a referral */
s = build_symlink(vp, cr, &strsz);
global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
DTRACE_PROBE2(nfs2serv__func__referral__reflink,
vnode_t *, vp, char *, s);
if (s == NULL)
error = EINVAL;
else {
error = 0;
(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
kmem_free(s, strsz);
}
} else {
/*
* Set up io vector to read sym link data
*/
iov.iov_base = rl->rl_data;
iov.iov_len = NFS_MAXPATHLEN;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = (offset_t)0;
uio.uio_resid = NFS_MAXPATHLEN;
/*
* Do the readlink.
*/
error = VOP_READLINK(vp, &uio, cr, NULL);
rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
if (!error)
rl->rl_data[rl->rl_count] = '\0';
}
VN_RELE(vp);
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, rl->rl_data,
NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
if (name != NULL && name != rl->rl_data) {
kmem_free(rl->rl_data, NFS_MAXPATHLEN);
rl->rl_data = name;
}
/*
* XNFS and RFC1094 require us to return ENXIO if argument
* is not a link. UFS returns EINVAL if this is the case,
* so we do the mapping here. BUGID 1138002.
*/
if (error == EINVAL)
rl->rl_status = NFSERR_NXIO;
else
rl->rl_status = puterrno(error);
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
return (fhp);
}
/*
* Free data allocated by rfs_readlink
*/
void
rfs_rlfree(struct nfsrdlnres *rl)
{
if (rl->rl_data != NULL)
kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
/*
* Read data.
* Returns some data read from the file at the given fhandle.
*/
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
vnode_t *vp;
int error;
struct vattr va;
struct iovec iov;
struct uio uio;
mblk_t *mp;
int alloc_err = 0;
int in_crit = 0;
caller_context_t ct;
vp = nfs_fhtovp(&ra->ra_fhandle, exi);
if (vp == NULL) {
rr->rr_data = NULL;
rr->rr_status = NFSERR_STALE;
return;
}
if (vp->v_type != VREG) {
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = NFSERR_ISDIR;
return;
}
ct.cc_sysid = 0;
ct.cc_pid = 0;
ct.cc_caller_id = nfs2_srv_caller_id;
ct.cc_flags = CC_DONTBLOCK;
/*
* Enter the critical region before calling VOP_RWLOCK
* to avoid a deadlock with write requests.
*/
if (nbl_need_check(vp)) {
nbl_start_crit(vp, RW_READER);
if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
0, NULL)) {
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = NFSERR_ACCES;
return;
}
in_crit = 1;
}
error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
rr->rr_data = NULL;
return;
}
va.va_mask = AT_ALL;
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
if (error) {
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = puterrno(error);
return;
}
/*
* This is a kludge to allow reading of files created
* with no read permission. The owner of the file
* is always allowed to read it.
*/
if (crgetuid(cr) != va.va_uid) {
error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
if (error) {
/*
* Exec is the same as read over the net because
* of demand loading.
*/
error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
}
if (error) {
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = puterrno(error);
return;
}
}
if (MANDLOCK(vp, va.va_mode)) {
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = NFSERR_ACCES;
return;
}
rr->rr_ok.rrok_wlist_len = 0;
rr->rr_ok.rrok_wlist = NULL;
if ((u_offset_t)ra->ra_offset >= va.va_size) {
rr->rr_count = 0;
rr->rr_data = NULL;
/*
* In this case, status is NFS_OK, but there is no data
* to encode. So set rr_mp to NULL.
*/
rr->rr_mp = NULL;
rr->rr_ok.rrok_wlist = ra->ra_wlist;
if (rr->rr_ok.rrok_wlist)
clist_zero_len(rr->rr_ok.rrok_wlist);
goto done;
}
if (ra->ra_wlist) {
mp = NULL;
rr->rr_mp = NULL;
(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
if (ra->ra_count > iov.iov_len) {
rr->rr_data = NULL;
rr->rr_status = NFSERR_INVAL;
goto done;
}
} else {
/*
* mp will contain the data to be sent out in the read reply.
* This will be freed after the reply has been sent out (by the
* driver).
* Let's round up the data to a BYTES_PER_XDR_UNIT multiple, so
* that the call to xdrmblk_putmblk() never fails.
*/
mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
&alloc_err);
ASSERT(mp != NULL);
ASSERT(alloc_err == 0);
rr->rr_mp = mp;
/*
* Set up io vector
*/
iov.iov_base = (caddr_t)mp->b_datap->db_base;
iov.iov_len = ra->ra_count;
}
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = (offset_t)ra->ra_offset;
uio.uio_resid = ra->ra_count;
error = VOP_READ(vp, &uio, 0, cr, &ct);
if (error) {
if (mp)
freeb(mp);
/*
* check if a monitor detected a delegation conflict and
* mark as wouldblock so response is dropped
*/
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
curthread->t_flag |= T_WOULDBLOCK;
else
rr->rr_status = puterrno(error);
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
return;
}
/*
* Get attributes again so we can send the latest access
* time to the client side for his cache.
*/
va.va_mask = AT_ALL;
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
if (error) {
if (mp)
freeb(mp);
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
rr->rr_data = NULL;
rr->rr_status = puterrno(error);
return;
}
rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
if (mp) {
rr->rr_data = (char *)mp->b_datap->db_base;
} else {
if (ra->ra_wlist) {
rr->rr_data = (caddr_t)iov.iov_base;
if (!rdma_setup_read_data2(ra, rr)) {
rr->rr_data = NULL;
rr->rr_status = puterrno(NFSERR_INVAL);
}
}
}
done:
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
if (in_crit)
nbl_end_crit(vp);
acl_perm(vp, exi, &va, cr);
/* check for overflows */
error = vattr_to_nattr(&va, &rr->rr_attr);
VN_RELE(vp);
rr->rr_status = puterrno(error);
}
/*
* Free data allocated by rfs_read
*/
void
rfs_rdfree(struct nfsrdresult *rr)
{
mblk_t *mp;
if (rr->rr_status == NFS_OK) {
mp = rr->rr_mp;
if (mp != NULL)
freeb(mp);
}
}
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
return (&ra->ra_fhandle);
}
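/*
 * Number of iovec entries preallocated on the stack for
 * rfs_write_sync(); longer mblk chains fall back to kmem_alloc.
 */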
#define MAX_IOVECS 12
#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
/*
* Write data to file.
* Returns attributes of a file after writing some data to it.
*
* Any changes made here, especially in error handling might have
* to also be done in rfs_write (which clusters write requests).
*/
/* ARGSUSED */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *vp;
rlim64_t rlimit;
struct vattr va;
struct uio uio;
struct iovec iov[MAX_IOVECS];
mblk_t *m;
struct iovec *iovp;
int iovcnt;
cred_t *savecred;
int in_crit = 0;
caller_context_t ct;
vp = nfs_fhtovp(&wa->wa_fhandle, exi);
if (vp == NULL) {
ns->ns_status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
ns->ns_status = NFSERR_ROFS;
return;
}
if (vp->v_type != VREG) {
VN_RELE(vp);
ns->ns_status = NFSERR_ISDIR;
return;
}
ct.cc_sysid = 0;
ct.cc_pid = 0;
ct.cc_caller_id = nfs2_srv_caller_id;
ct.cc_flags = CC_DONTBLOCK;
va.va_mask = AT_UID|AT_MODE;
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
if (error) {
VN_RELE(vp);
ns->ns_status = puterrno(error);
return;
}
if (crgetuid(cr) != va.va_uid) {
/*
* This is a kludge to allow writes of files created
* with read only permission. The owner of the file
* is always allowed to write it.
*/
error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
if (error) {
VN_RELE(vp);
ns->ns_status = puterrno(error);
return;
}
}
/*
* Can't access a mandatory lock file. This might cause
* the NFS service thread to block forever waiting for a
* lock to be released that will never be released.
*/
if (MANDLOCK(vp, va.va_mode)) {
VN_RELE(vp);
ns->ns_status = NFSERR_ACCES;
return;
}
/*
* We have to enter the critical region before calling VOP_RWLOCK
* to avoid a deadlock with ufs.
*/
if (nbl_need_check(vp)) {
nbl_start_crit(vp, RW_READER);
in_crit = 1;
if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
wa->wa_count, 0, NULL)) {
error = EACCES;
goto out;
}
}
error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
return;
}
if (wa->wa_data || wa->wa_rlist) {
/* Do the RDMA thing if necessary */
if (wa->wa_rlist) {
iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
iov[0].iov_len = wa->wa_count;
} else {
iov[0].iov_base = wa->wa_data;
iov[0].iov_len = wa->wa_count;
}
uio.uio_iov = iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_DEFAULT;
uio.uio_loffset = (offset_t)wa->wa_offset;
uio.uio_resid = wa->wa_count;
/*
* The limit is checked on the client. We
* should allow any size writes here.
*/
uio.uio_llimit = curproc->p_fsz_ctl;
rlimit = uio.uio_llimit - wa->wa_offset;
if (rlimit < (rlim64_t)uio.uio_resid)
uio.uio_resid = (uint_t)rlimit;
/*
* for now we assume no append mode
*/
/*
* We're changing creds because VM may fault and we need
* the cred of the current thread to be used if quota
* checking is enabled.
*/
savecred = curthread->t_cred;
curthread->t_cred = cr;
error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
curthread->t_cred = savecred;
} else {
iovcnt = 0;
for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
iovcnt++;
if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
rfs_write_sync_hits++;
#endif
iovp = iov;
} else {
#ifdef DEBUG
rfs_write_sync_misses++;
#endif
iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
}
mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
uio.uio_iov = iovp;
uio.uio_iovcnt = iovcnt;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_DEFAULT;
uio.uio_loffset = (offset_t)wa->wa_offset;
uio.uio_resid = wa->wa_count;
/*
* The limit is checked on the client. We
* should allow any size writes here.
*/
uio.uio_llimit = curproc->p_fsz_ctl;
rlimit = uio.uio_llimit - wa->wa_offset;
if (rlimit < (rlim64_t)uio.uio_resid)
uio.uio_resid = (uint_t)rlimit;
/*
* For now we assume no append mode.
*/
/*
* We're changing creds because VM may fault and we need
* the cred of the current thread to be used if quota
* checking is enabled.
*/
savecred = curthread->t_cred;
curthread->t_cred = cr;
error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
curthread->t_cred = savecred;
if (iovp != iov)
kmem_free(iovp, sizeof (*iovp) * iovcnt);
}
VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
if (!error) {
/*
* Get attributes again so we send the latest mod
* time to the client side for his cache.
*/
va.va_mask = AT_ALL; /* now we want everything */
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
/* check for overflows */
if (!error) {
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &ns->ns_attr);
}
}
out:
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
else
ns->ns_status = puterrno(error);
}
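/*
 * Write clustering support. Each pending write request is queued
 * as an rfs_async_write entry on a per-file rfs_async_write_list,
 * keyed by file handle. The first thread to reach a file processes
 * the entire cluster; the other threads simply queue their requests
 * and wait on the list's condition variable until their status has
 * been filled in.
 */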
struct rfs_async_write {
struct nfswriteargs *wa;
struct nfsattrstat *ns;
struct svc_req *req;
cred_t *cr;
bool_t ro;
kthread_t *thread;
struct rfs_async_write *list;
};
struct rfs_async_write_list {
fhandle_t *fhp;
kcondvar_t cv;
struct rfs_async_write *list;
struct rfs_async_write_list *next;
};
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */
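/*
 * Maximum number of iovec entries kept on the stack when writing
 * out a cluster; larger clusters fall back to kmem_alloc.
 */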
#define MAXCLIOVECS 42
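/*
 * Sentinel status marking requests in a cluster that have not yet
 * been processed.
 */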
#define RFSWRITE_INITVAL (enum nfsstat) -1
#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
/*
* Write data to file.
* Returns attributes of a file after writing some data to it.
*/
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *vp;
rlim64_t rlimit;
struct vattr va;
struct uio uio;
struct rfs_async_write_list *lp;
struct rfs_async_write_list *nlp;
struct rfs_async_write *rp;
struct rfs_async_write *nrp;
struct rfs_async_write *trp;
struct rfs_async_write *lrp;
int data_written;
int iovcnt;
mblk_t *m;
struct iovec *iovp;
struct iovec *niovp;
struct iovec iov[MAXCLIOVECS];
int count;
int rcount;
uint_t off;
uint_t len;
struct rfs_async_write nrpsp;
struct rfs_async_write_list nlpsp;
ushort_t t_flag;
cred_t *savecred;
int in_crit = 0;
caller_context_t ct;
if (!rfs_write_async) {
rfs_write_sync(wa, ns, exi, req, cr, ro);
return;
}
/*
* Initialize status to RFSWRITE_INITVAL instead of 0, since a value
* of 0 means NFS_OK.
*/
ns->ns_status = RFSWRITE_INITVAL;
nrp = &nrpsp;
nrp->wa = wa;
nrp->ns = ns;
nrp->req = req;
nrp->cr = cr;
nrp->ro = ro;
nrp->thread = curthread;
ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
/*
* Look to see if there is already a cluster started
* for this file.
*/
mutex_enter(&rfs_async_write_lock);
for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
if (bcmp(&wa->wa_fhandle, lp->fhp,
sizeof (fhandle_t)) == 0)
break;
}
/*
* If lp is non-NULL, then there is already a cluster
* started. We need to place ourselves in the cluster
* list in the right place as determined by starting
* offset. Conflicts with non-blocking mandatory locked
* regions will be checked when the cluster is processed.
*/
if (lp != NULL) {
rp = lp->list;
trp = NULL;
while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
trp = rp;
rp = rp->list;
}
nrp->list = rp;
if (trp == NULL)
lp->list = nrp;
else
trp->list = nrp;
while (nrp->ns->ns_status == RFSWRITE_INITVAL)
cv_wait(&lp->cv, &rfs_async_write_lock);
mutex_exit(&rfs_async_write_lock);
return;
}
/*
* No cluster started yet, start one and add ourselves
* to the list of clusters.
*/
nrp->list = NULL;
nlp = &nlpsp;
nlp->fhp = &wa->wa_fhandle;
cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
nlp->list = nrp;
nlp->next = NULL;
if (rfs_async_write_head == NULL) {
rfs_async_write_head = nlp;
} else {
lp = rfs_async_write_head;
while (lp->next != NULL)
lp = lp->next;
lp->next = nlp;
}
mutex_exit(&rfs_async_write_lock);
/*
* Convert the file handle common to all of the requests
* in this cluster to a vnode.
*/
vp = nfs_fhtovp(&wa->wa_fhandle, exi);
if (vp == NULL) {
mutex_enter(&rfs_async_write_lock);
if (rfs_async_write_head == nlp)
rfs_async_write_head = nlp->next;
else {
lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_STALE;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
mutex_exit(&rfs_async_write_lock);
return;
}
/*
* Can only write regular files. Attempts to write any
* other file types fail with EISDIR.
*/
if (vp->v_type != VREG) {
VN_RELE(vp);
mutex_enter(&rfs_async_write_lock);
if (rfs_async_write_head == nlp)
rfs_async_write_head = nlp->next;
else {
lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_ISDIR;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
mutex_exit(&rfs_async_write_lock);
return;
}
/*
* Enter the critical region before calling VOP_RWLOCK, to avoid a
* deadlock with ufs.
*/
if (nbl_need_check(vp)) {
nbl_start_crit(vp, RW_READER);
in_crit = 1;
}
ct.cc_sysid = 0;
ct.cc_pid = 0;
ct.cc_caller_id = nfs2_srv_caller_id;
ct.cc_flags = CC_DONTBLOCK;
/*
* Lock the file for writing. This operation provides
* the delay which allows clusters to grow.
*/
error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
mutex_enter(&rfs_async_write_lock);
if (rfs_async_write_head == nlp)
rfs_async_write_head = nlp->next;
else {
lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
for (rp = nlp->list; rp != NULL; rp = rp->list) {
if (rp->ns->ns_status == RFSWRITE_INITVAL) {
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= T_WOULDBLOCK;
}
}
cv_broadcast(&nlp->cv);
mutex_exit(&rfs_async_write_lock);
return;
}
/*
* Disconnect this cluster from the list of clusters.
* The cluster that is being dealt with must be fixed
* in size after this point, so there is no reason
* to leave it on the list so that new requests can
* find it.
*
* The algorithm is that the first write request will
* create a cluster, convert the file handle to a
* vnode pointer, and then lock the file for writing.
* This request is not likely to be clustered with
* any others. However, the next request will create
* a new cluster and be blocked in VOP_RWLOCK while
* the first request is being processed. This delay
* will allow more requests to be clustered in this
* second cluster.
*/
mutex_enter(&rfs_async_write_lock);
if (rfs_async_write_head == nlp)
rfs_async_write_head = nlp->next;
else {
lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
mutex_exit(&rfs_async_write_lock);
/*
* Step through the list of requests in this cluster.
* We need to check permissions to make sure that all
* of the requests have sufficient permission to write
* the file. A cluster can be composed of requests
* from different clients and different users on each
* client.
*
* As a side effect, we also calculate the size of the
* byte range that this cluster encompasses.
*/
rp = nlp->list;
off = rp->wa->wa_offset;
len = (uint_t)0;
do {
if (rdonly(rp->ro, vp)) {
rp->ns->ns_status = NFSERR_ROFS;
t_flag = curthread->t_flag & T_WOULDBLOCK;
rp->thread->t_flag |= t_flag;
continue;
}
va.va_mask = AT_UID|AT_MODE;
error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
if (!error) {
if (crgetuid(rp->cr) != va.va_uid) {
/*
* This is a kludge to allow writes of files
* created with read only permission. The
* owner of the file is always allowed to
* write it.
*/
error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
}
if (!error && MANDLOCK(vp, va.va_mode))
error = EACCES;
}
/*
* Check for a conflict with a nbmand-locked region.
*/
if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
rp->wa->wa_count, 0, NULL)) {
error = EACCES;
}
if (error) {
rp->ns->ns_status = puterrno(error);
t_flag = curthread->t_flag & T_WOULDBLOCK;
rp->thread->t_flag |= t_flag;
continue;
}
if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
len = rp->wa->wa_offset + rp->wa->wa_count - off;
} while ((rp = rp->list) != NULL);
/*
* Step through the cluster, attempting to gather as many
* contiguous requests as possible. Contiguous requests are
* handled with a single call to VOP_WRITE instead of a
* separate call per request. We also keep track of whether
* any data was written at all.
*/
rp = nlp->list;
data_written = 0;
do {
/*
* Skip any requests which are already marked as having an
* error.
*/
if (rp->ns->ns_status != RFSWRITE_INITVAL) {
rp = rp->list;
continue;
}
/*
* Count the number of iovecs which are required
* to handle this set of requests. One iovec is
* needed for each data buffer, whether addressed
* by wa_data or by the b_rptr pointers in the
* mblk chains.
*/
iovcnt = 0;
lrp = rp;
for (;;) {
if (lrp->wa->wa_data || lrp->wa->wa_rlist)
iovcnt++;
else {
m = lrp->wa->wa_mblk;
while (m != NULL) {
iovcnt++;
m = m->b_cont;
}
}
if (lrp->list == NULL ||
lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
lrp->wa->wa_offset + lrp->wa->wa_count !=
lrp->list->wa->wa_offset) {
lrp = lrp->list;
break;
}
lrp = lrp->list;
}
if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
rfs_write_hits++;
#endif
niovp = iov;
} else {
#ifdef DEBUG
rfs_write_misses++;
#endif
niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
}
/*
* Put together the scatter/gather iovecs.
*/
iovp = niovp;
trp = rp;
count = 0;
do {
if (trp->wa->wa_data || trp->wa->wa_rlist) {
if (trp->wa->wa_rlist) {
iovp->iov_base =
(char *)((trp->wa->wa_rlist)->
u.c_daddr3);
iovp->iov_len = trp->wa->wa_count;
} else {
iovp->iov_base = trp->wa->wa_data;
iovp->iov_len = trp->wa->wa_count;
}
iovp++;
} else {
m = trp->wa->wa_mblk;
rcount = trp->wa->wa_count;
while (m != NULL) {
iovp->iov_base = (caddr_t)m->b_rptr;
iovp->iov_len = (m->b_wptr - m->b_rptr);
rcount -= iovp->iov_len;
if (rcount < 0)
iovp->iov_len += rcount;
iovp++;
if (rcount <= 0)
break;
m = m->b_cont;
}
}
count += trp->wa->wa_count;
trp = trp->list;
} while (trp != lrp);
uio.uio_iov = niovp;
uio.uio_iovcnt = iovcnt;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_DEFAULT;
uio.uio_loffset = (offset_t)rp->wa->wa_offset;
uio.uio_resid = count;
/*
* The limit is checked on the client. We
* should allow any size writes here.
*/
uio.uio_llimit = curproc->p_fsz_ctl;
rlimit = uio.uio_llimit - rp->wa->wa_offset;
if (rlimit < (rlim64_t)uio.uio_resid)
uio.uio_resid = (uint_t)rlimit;
/*
* For now we assume no append mode.
*/
/*
* We're changing creds because VM may fault
* and we need the cred of the current
* thread to be used if quota checking is
* enabled.
*/
savecred = curthread->t_cred;
curthread->t_cred = cr;
error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
curthread->t_cred = savecred;
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
if (niovp != iov)
kmem_free(niovp, sizeof (*niovp) * iovcnt);
if (!error) {
data_written = 1;
/*
* Get attributes again so we send the latest mod
* time to the client side for his cache.
*/
va.va_mask = AT_ALL; /* now we want everything */
error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
if (!error)
acl_perm(vp, exi, &va, rp->cr);
}
/*
* Fill in the status responses for each request
* which was just handled. Also, copy the latest
* attributes in to the attribute responses if
* appropriate.
*/
t_flag = curthread->t_flag & T_WOULDBLOCK;
do {
rp->thread->t_flag |= t_flag;
/* check for overflows */
if (!error) {
error = vattr_to_nattr(&va, &rp->ns->ns_attr);
}
rp->ns->ns_status = puterrno(error);
rp = rp->list;
} while (rp != lrp);
} while (rp != NULL);
/*
* If any data was written at all, then we need to flush
* the data and metadata to stable storage.
*/
if (data_written) {
error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
if (!error) {
error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
}
}
VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
t_flag = curthread->t_flag & T_WOULDBLOCK;
mutex_enter(&rfs_async_write_lock);
for (rp = nlp->list; rp != NULL; rp = rp->list) {
if (rp->ns->ns_status == RFSWRITE_INITVAL) {
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= t_flag;
}
}
cv_broadcast(&nlp->cv);
mutex_exit(&rfs_async_write_lock);
}
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
return (&wa->wa_fhandle);
}
/*
* Create a file.
* Creates a file with given attributes and returns those attributes
* and an fhandle for the new file.
*/
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
int lookuperr;
int in_crit = 0;
struct vattr va;
vnode_t *vp;
vnode_t *realvp;
vnode_t *dvp;
char *name = args->ca_da.da_name;
vnode_t *tvp = NULL;
int mode;
int lookup_ok;
bool_t trunc;
struct sockaddr *ca;
/*
* Disallow NULL paths
*/
if (name == NULL || *name == '\0') {
dr->dr_status = NFSERR_ACCES;
return;
}
dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
if (dvp == NULL) {
dr->dr_status = NFSERR_STALE;
return;
}
error = sattr_to_vattr(args->ca_sa, &va);
if (error) {
dr->dr_status = puterrno(error);
return;
}
/*
* Must specify the mode.
*/
if (!(va.va_mask & AT_MODE)) {
VN_RELE(dvp);
dr->dr_status = NFSERR_INVAL;
return;
}
/*
* This is a completely gross hack to make mknod
* work over the wire until we can whack the protocol
*/
if ((va.va_mode & IFMT) == IFCHR) {
if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
va.va_type = VFIFO; /* xtra kludge for named pipe */
else {
va.va_type = VCHR;
/*
* uncompress the received dev_t
* if the top half is zero indicating a request
* from an `older style' OS.
*/
if ((va.va_size & 0xffff0000) == 0)
va.va_rdev = nfsv2_expdev(va.va_size);
else
va.va_rdev = (dev_t)va.va_size;
}
va.va_mask &= ~AT_SIZE;
} else if ((va.va_mode & IFMT) == IFBLK) {
va.va_type = VBLK;
/*
* uncompress the received dev_t
* if the top half is zero indicating a request
* from an `older style' OS.
*/
if ((va.va_size & 0xffff0000) == 0)
va.va_rdev = nfsv2_expdev(va.va_size);
else
va.va_rdev = (dev_t)va.va_size;
va.va_mask &= ~AT_SIZE;
} else if ((va.va_mode & IFMT) == IFSOCK) {
va.va_type = VSOCK;
} else {
va.va_type = VREG;
}
va.va_mode &= ~IFMT;
va.va_mask |= AT_TYPE;
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
MAXPATHLEN);
if (name == NULL) {
dr->dr_status = puterrno(EINVAL);
return;
}
/*
* Why was the choice made to use VWRITE as the mode to the
* call to VOP_CREATE? This results in a bug. When a client
* opens a file that already exists and is RDONLY, the second
* open fails with EACCES because of the mode.
* Bug ID 1054648.
*/
lookup_ok = 0;
mode = VWRITE;
if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
NULL, NULL, NULL);
if (!error) {
struct vattr at;
lookup_ok = 1;
at.va_mask = AT_MODE;
error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
if (!error)
mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
VN_RELE(tvp);
tvp = NULL;
}
}
if (!lookup_ok) {
if (rdonly(ro, dvp)) {
error = EROFS;
} else if (va.va_type != VREG && va.va_type != VFIFO &&
va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
error = EPERM;
} else {
error = 0;
}
}
/*
* If file size is being modified on an already existing file
* make sure that there are no conflicting non-blocking mandatory
* locks in the region being manipulated. Return EACCES if there
* are conflicting locks.
*/
if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
NULL, NULL, NULL);
if (!lookuperr &&
rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
VN_RELE(tvp);
curthread->t_flag |= T_WOULDBLOCK;
goto out;
}
if (!lookuperr && nbl_need_check(tvp)) {
/*
* The file exists. Now check if it has any
* conflicting non-blocking mandatory locks
* in the region being changed.
*/
struct vattr bva;
u_offset_t offset;
ssize_t length;
nbl_start_crit(tvp, RW_READER);
in_crit = 1;
bva.va_mask = AT_SIZE;
error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
if (!error) {
if (va.va_size < bva.va_size) {
offset = va.va_size;
length = bva.va_size - va.va_size;
} else {
offset = bva.va_size;
length = va.va_size - bva.va_size;
}
if (length) {
if (nbl_conflict(tvp, NBL_WRITE,
offset, length, 0, NULL)) {
error = EACCES;
}
}
}
if (error) {
nbl_end_crit(tvp);
VN_RELE(tvp);
in_crit = 0;
}
} else if (tvp != NULL) {
VN_RELE(tvp);
}
}
if (!error) {
/*
* If the filesystem is shared with nosuid, then remove any
* setuid/setgid bits on create.
*/
if (va.va_type == VREG &&
exi->exi_export.ex_flags & EX_NOSUID)
va.va_mode &= ~(VSUID | VSGID);
error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
NULL, NULL);
if (!error) {
if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
trunc = TRUE;
else
trunc = FALSE;
if (rfs4_check_delegated(FWRITE, vp, trunc)) {
VN_RELE(vp);
curthread->t_flag |= T_WOULDBLOCK;
goto out;
}
va.va_mask = AT_ALL;
error = VOP_GETATTR(vp, &va, 0, cr, NULL);
/* check for overflows */
if (!error) {
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &dr->dr_attr);
if (!error) {
error = makefh(&dr->dr_fhandle, vp,
exi);
}
}
/*
* Force modified metadata out to stable storage.
*
* If an underlying vp exists, pass it to VOP_FSYNC.
*/
if (VOP_REALVP(vp, &realvp, NULL) == 0)
(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
else
(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
VN_RELE(vp);
}
if (in_crit) {
nbl_end_crit(tvp);
VN_RELE(tvp);
}
}
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(dvp, 0, cr, NULL);
out:
VN_RELE(dvp);
dr->dr_status = puterrno(error);
if (name != args->ca_da.da_name)
kmem_free(name, MAXPATHLEN);
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
return (args->ca_da.da_fhandle);
}
/*
* Remove a file.
* Remove named file from parent directory.
*/
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error = 0;
vnode_t *vp;
vnode_t *targvp;
int in_crit = 0;
/*
* Disallow NULL paths
*/
if (da->da_name == NULL || *da->da_name == '\0') {
*status = NFSERR_ACCES;
return;
}
vp = nfs_fhtovp(da->da_fhandle, exi);
if (vp == NULL) {
*status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
*status = NFSERR_ROFS;
return;
}
/*
* Check for a conflict with a non-blocking mandatory share reservation.
*/
error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
NULL, cr, NULL, NULL, NULL);
if (error != 0) {
VN_RELE(vp);
*status = puterrno(error);
return;
}
/*
* If the file is delegated to a v4 client, then initiate
* recall and drop this request (by setting T_WOULDBLOCK).
* The client will eventually re-transmit the request and
* (hopefully), by then, the v4 client will have returned
* the delegation.
*/
if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
VN_RELE(vp);
VN_RELE(targvp);
curthread->t_flag |= T_WOULDBLOCK;
return;
}
if (nbl_need_check(targvp)) {
nbl_start_crit(targvp, RW_READER);
in_crit = 1;
if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
error = EACCES;
goto out;
}
}
error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
out:
if (in_crit)
nbl_end_crit(targvp);
VN_RELE(targvp);
VN_RELE(vp);
*status = puterrno(error);
}
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
return (da->da_fhandle);
}
/*
* rename a file
* Give a file (from) a new name (to).
*/
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error = 0;
vnode_t *fromvp;
vnode_t *tovp;
struct exportinfo *to_exi;
fhandle_t *fh;
vnode_t *srcvp;
vnode_t *targvp;
int in_crit = 0;
fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
if (fromvp == NULL) {
*status = NFSERR_STALE;
return;
}
fh = args->rna_to.da_fhandle;
to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
if (to_exi == NULL) {
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
exi_rele(to_exi);
if (to_exi != exi) {
VN_RELE(fromvp);
*status = NFSERR_XDEV;
return;
}
tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
if (tovp == NULL) {
VN_RELE(fromvp);
*status = NFSERR_STALE;
return;
}
if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_NOTDIR;
return;
}
/*
* Disallow NULL paths
*/
if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
if (rdonly(ro, tovp)) {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_ROFS;
return;
}
/*
* Check for a conflict with a non-blocking mandatory share reservation.
*/
error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
NULL, cr, NULL, NULL, NULL);
if (error != 0) {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = puterrno(error);
return;
}
/* Check for delegations on the source file */
if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
VN_RELE(tovp);
VN_RELE(fromvp);
VN_RELE(srcvp);
curthread->t_flag |= T_WOULDBLOCK;
return;
}
/* Check for delegation on the file being renamed over, if it exists */
if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
NULL, NULL, NULL) == 0) {
if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
VN_RELE(tovp);
VN_RELE(fromvp);
VN_RELE(srcvp);
VN_RELE(targvp);
curthread->t_flag |= T_WOULDBLOCK;
return;
}
VN_RELE(targvp);
}
if (nbl_need_check(srcvp)) {
nbl_start_crit(srcvp, RW_READER);
in_crit = 1;
if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
error = EACCES;
goto out;
}
}
error = VOP_RENAME(fromvp, args->rna_from.da_name,
tovp, args->rna_to.da_name, cr, NULL, 0);
if (error == 0)
vn_renamepath(tovp, srcvp, args->rna_to.da_name,
strlen(args->rna_to.da_name));
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(tovp, 0, cr, NULL);
(void) VOP_FSYNC(fromvp, 0, cr, NULL);
out:
if (in_crit)
nbl_end_crit(srcvp);
VN_RELE(srcvp);
VN_RELE(tovp);
VN_RELE(fromvp);
*status = puterrno(error);
}
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
return (args->rna_from.da_fhandle);
}
/*
* Link to a file.
* Create a file (to) which is a hard link to the given file (from).
*/
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *fromvp;
vnode_t *tovp;
struct exportinfo *to_exi;
fhandle_t *fh;
fromvp = nfs_fhtovp(args->la_from, exi);
if (fromvp == NULL) {
*status = NFSERR_STALE;
return;
}
fh = args->la_to.da_fhandle;
to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
if (to_exi == NULL) {
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
exi_rele(to_exi);
if (to_exi != exi) {
VN_RELE(fromvp);
*status = NFSERR_XDEV;
return;
}
tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
if (tovp == NULL) {
VN_RELE(fromvp);
*status = NFSERR_STALE;
return;
}
if (tovp->v_type != VDIR) {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_NOTDIR;
return;
}
/*
* Disallow NULL paths
*/
if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
if (rdonly(ro, tovp)) {
VN_RELE(tovp);
VN_RELE(fromvp);
*status = NFSERR_ROFS;
return;
}
error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(tovp, 0, cr, NULL);
(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
VN_RELE(tovp);
VN_RELE(fromvp);
*status = puterrno(error);
}
void *
rfs_link_getfh(struct nfslinkargs *args)
{
return (args->la_from);
}
/*
* Symbolically link to a file.
* Create a file (from) with the given attributes which is a symbolic
* link to the given path name (tnm).
*/
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
struct vattr va;
vnode_t *vp;
vnode_t *svp;
int lerror;
struct sockaddr *ca;
char *name = NULL;
/*
* Disallow NULL paths
*/
if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
*status = NFSERR_ACCES;
return;
}
vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
if (vp == NULL) {
*status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
*status = NFSERR_ROFS;
return;
}
error = sattr_to_vattr(args->sla_sa, &va);
if (error) {
VN_RELE(vp);
*status = puterrno(error);
return;
}
if (!(va.va_mask & AT_MODE)) {
VN_RELE(vp);
*status = NFSERR_INVAL;
return;
}
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, args->sla_tnm,
NFSCMD_CONV_INBOUND, MAXPATHLEN);
if (name == NULL) {
*status = NFSERR_ACCES;
return;
}
va.va_type = VLNK;
va.va_mask |= AT_TYPE;
error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
/*
* Force new data and metadata out to stable storage.
*/
lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
NULL, cr, NULL, NULL, NULL);
if (!lerror) {
(void) VOP_FSYNC(svp, 0, cr, NULL);
VN_RELE(svp);
}
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
VN_RELE(vp);
*status = puterrno(error);
if (name != args->sla_tnm)
kmem_free(name, MAXPATHLEN);
}
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
return (args->sla_from.da_fhandle);
}
/*
* Make a directory.
* Create a directory with the given name, parent directory, and attributes.
* Returns a file handle and attributes for the new directory.
*/
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
struct vattr va;
vnode_t *dvp = NULL;
vnode_t *vp;
char *name = args->ca_da.da_name;
/*
* Disallow NULL paths
*/
if (name == NULL || *name == '\0') {
dr->dr_status = NFSERR_ACCES;
return;
}
vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
if (vp == NULL) {
dr->dr_status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
dr->dr_status = NFSERR_ROFS;
return;
}
error = sattr_to_vattr(args->ca_sa, &va);
if (error) {
VN_RELE(vp);
dr->dr_status = puterrno(error);
return;
}
if (!(va.va_mask & AT_MODE)) {
VN_RELE(vp);
dr->dr_status = NFSERR_INVAL;
return;
}
va.va_type = VDIR;
va.va_mask |= AT_TYPE;
error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
if (!error) {
/*
* Attributes of the newly created directory should
* be returned to the client.
*/
va.va_mask = AT_ALL; /* We want everything */
error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
/* check for overflows */
if (!error) {
acl_perm(vp, exi, &va, cr);
error = vattr_to_nattr(&va, &dr->dr_attr);
if (!error) {
error = makefh(&dr->dr_fhandle, dvp, exi);
}
}
/*
* Force new data and metadata out to stable storage.
*/
(void) VOP_FSYNC(dvp, 0, cr, NULL);
VN_RELE(dvp);
}
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
VN_RELE(vp);
dr->dr_status = puterrno(error);
}
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
return (args->ca_da.da_fhandle);
}
/*
* Remove a directory.
* Remove the given directory name from the given parent directory.
*/
/* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
vnode_t *vp;
/*
* Disallow NULL paths
*/
if (da->da_name == NULL || *da->da_name == '\0') {
*status = NFSERR_ACCES;
return;
}
vp = nfs_fhtovp(da->da_fhandle, exi);
if (vp == NULL) {
*status = NFSERR_STALE;
return;
}
if (rdonly(ro, vp)) {
VN_RELE(vp);
*status = NFSERR_ROFS;
return;
}
/*
* VOP_RMDIR takes a third argument (the current
* directory of the process). That's because someone
* wants to return EINVAL if one tries to remove ".".
* Of course, NFS servers have no idea what their
* clients' current directories are. We fake it by
* supplying a vnode known to exist and illegal to
* remove.
*/
error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
VN_RELE(vp);
/*
* System V defines rmdir to return EEXIST, not ENOTEMPTY,
* if the directory is not empty. A System V NFS server
* needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
* over the wire.
*/
if (error == EEXIST)
*status = NFSERR_NOTEMPTY;
else
*status = puterrno(error);
}
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
return (da->da_fhandle);
}
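/*
 * Read directory entries.
 * Returns directory entries from the directory at the given fhandle,
 * starting at the given offset, along with an EOF indication.
 */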
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
int iseof;
struct iovec iov;
struct uio uio;
vnode_t *vp;
char *ndata = NULL;
struct sockaddr *ca;
size_t nents;
int ret;
vp = nfs_fhtovp(&rda->rda_fh, exi);
if (vp == NULL) {
rd->rd_entries = NULL;
rd->rd_status = NFSERR_STALE;
return;
}
if (vp->v_type != VDIR) {
VN_RELE(vp);
rd->rd_entries = NULL;
rd->rd_status = NFSERR_NOTDIR;
return;
}
(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
if (error) {
rd->rd_entries = NULL;
goto bad;
}
if (rda->rda_count == 0) {
rd->rd_entries = NULL;
rd->rd_size = 0;
rd->rd_eof = FALSE;
goto bad;
}
rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
/*
* Allocate data for entries. This will be freed by rfs_rddirfree.
*/
rd->rd_bufsize = (uint_t)rda->rda_count;
rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
/*
* Set up io vector to read directory data
*/
iov.iov_base = (caddr_t)rd->rd_entries;
iov.iov_len = rda->rda_count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = (offset_t)rda->rda_offset;
uio.uio_resid = rda->rda_count;
/*
* read directory
*/
error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
/*
* Clean up
*/
if (!error) {
/*
* set size and eof
*/
if (uio.uio_resid == rda->rda_count) {
rd->rd_size = 0;
rd->rd_eof = TRUE;
} else {
rd->rd_size = (uint32_t)(rda->rda_count -
uio.uio_resid);
rd->rd_eof = iseof ? TRUE : FALSE;
}
}
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
rda->rda_count, &ndata);
if (ret != 0) {
size_t dropbytes;
/*
* We had to drop one or more entries in order to fit
* during the character conversion. We need to patch
* up the size and eof info.
*/
if (rd->rd_eof)
rd->rd_eof = FALSE;
dropbytes = nfscmd_dropped_entrysize(
(struct dirent64 *)rd->rd_entries, nents, ret);
rd->rd_size -= dropbytes;
}
if (ndata == NULL) {
ndata = (char *)rd->rd_entries;
} else if (ndata != (char *)rd->rd_entries) {
kmem_free(rd->rd_entries, rd->rd_bufsize);
rd->rd_entries = (void *)ndata;
rd->rd_bufsize = rda->rda_count;
}
bad:
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
#if 0 /* notyet */
/*
* Don't do this. It causes local disk writes when just
* reading the file and the overhead is deemed larger
* than the benefit.
*/
/*
* Force modified metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif
VN_RELE(vp);
rd->rd_status = puterrno(error);
}
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
return (&rda->rda_fh);
}
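/*
 * Free data allocated by rfs_readdir.
 */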
void
rfs_rddirfree(struct nfsrddirres *rd)
{
if (rd->rd_entries != NULL)
kmem_free(rd->rd_entries, rd->rd_bufsize);
}
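/*
 * Get filesystem statistics.
 * Returns the transfer size, block size, and block counts for the
 * filesystem containing the given fhandle.
 */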
/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
struct statvfs64 sb;
vnode_t *vp;
vp = nfs_fhtovp(fh, exi);
if (vp == NULL) {
fs->fs_status = NFSERR_STALE;
return;
}
error = VFS_STATVFS(vp->v_vfsp, &sb);
if (!error) {
fs->fs_tsize = nfstsize();
fs->fs_bsize = sb.f_frsize;
fs->fs_blocks = sb.f_blocks;
fs->fs_bfree = sb.f_bfree;
fs->fs_bavail = sb.f_bavail;
}
VN_RELE(vp);
fs->fs_status = puterrno(error);
}
void *
rfs_statfs_getfh(fhandle_t *fh)
{
return (fh);
}
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
vap->va_mask = 0;
/*
* There was a sign extension bug in some VFS based systems
* which stored the mode as a short. When it would get
* assigned to a u_long, no sign extension would occur.
* It needed to, but this wasn't noticed because sa_mode
* would then get assigned back to the short, thus ignoring
* the upper 16 bits of sa_mode.
*
* To make this implementation work for both broken
* clients and good clients, we check for both versions
* of the mode.
*/
if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
sa->sa_mode != (uint32_t)-1) {
vap->va_mask |= AT_MODE;
vap->va_mode = sa->sa_mode;
}
if (sa->sa_uid != (uint32_t)-1) {
vap->va_mask |= AT_UID;
vap->va_uid = sa->sa_uid;
}
if (sa->sa_gid != (uint32_t)-1) {
vap->va_mask |= AT_GID;
vap->va_gid = sa->sa_gid;
}
if (sa->sa_size != (uint32_t)-1) {
vap->va_mask |= AT_SIZE;
vap->va_size = sa->sa_size;
}
if (sa->sa_atime.tv_sec != (int32_t)-1 &&
sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
/* return error if time overflow */
if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
return (EOVERFLOW);
#endif
vap->va_mask |= AT_ATIME;
/*
* nfs protocol defines times as unsigned so don't extend sign,
* unless sysadmin set nfs_allow_preepoch_time.
*/
NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
}
if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
/* return error if time overflow */
if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
return (EOVERFLOW);
#endif
vap->va_mask |= AT_MTIME;
/*
* nfs protocol defines times as unsigned so don't extend sign,
* unless sysadmin set nfs_allow_preepoch_time.
*/
NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
}
return (0);
}
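/*
 * Map vnode types (vtype_t) to over-the-wire NFS file types.
 * Types with no NFSv2 equivalent map to 0.
 */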
static enum nfsftype vt_to_nf[] = {
0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
/*
* check the following fields for overflow: nodeid, size, and time.
* There could be a problem when converting 64-bit LP64 fields
* into 32-bit ones. Return an error if there is an overflow.
*/
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
na->na_type = vt_to_nf[vap->va_type];
if (vap->va_mode == (unsigned short) -1)
na->na_mode = (uint32_t)-1;
else
na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
if (vap->va_uid == (unsigned short)(-1))
na->na_uid = (uint32_t)(-1);
else if (vap->va_uid == UID_NOBODY)
na->na_uid = (uint32_t)NFS_UID_NOBODY;
else
na->na_uid = vap->va_uid;
if (vap->va_gid == (unsigned short)(-1))
na->na_gid = (uint32_t)-1;
else if (vap->va_gid == GID_NOBODY)
na->na_gid = (uint32_t)NFS_GID_NOBODY;
else
na->na_gid = vap->va_gid;
/*
* Do we need to check fsid for overflow? It is 64-bit in the
* vattr, but are values wider than 32 bits supported?
*/
na->na_fsid = vap->va_fsid;
na->na_nodeid = vap->va_nodeid;
/*
* Check to make sure that the nodeid is representable over the
* wire without losing bits.
*/
if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
return (EFBIG);
na->na_nlink = vap->va_nlink;
/*
* Check for big files here, instead of at the caller. See
* comments in cstat for large special file explanation.
*/
if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
if ((vap->va_type == VREG) || (vap->va_type == VDIR))
return (EFBIG);
if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
/* UNKNOWN_SIZE | OVERFLOW */
na->na_size = MAXOFF32_T;
} else
na->na_size = vap->va_size;
} else
na->na_size = vap->va_size;
/*
* If the vnode times overflow the 32-bit times that NFS2
* uses on the wire then return an error.
*/
if (!NFS_VAP_TIME_OK(vap)) {
return (EOVERFLOW);
}
na->na_atime.tv_sec = vap->va_atime.tv_sec;
na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
/*
* If the dev_t will fit into 16 bits then compress
* it, otherwise leave it alone. See comments in
* nfs_client.c.
*/
if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
getmajor(vap->va_rdev) <= SO4_MAXMAJ)
na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
else
(void) cmpldev(&na->na_rdev, vap->va_rdev);
na->na_blocks = vap->va_nblocks;
na->na_blocksize = vap->va_blksize;
/*
* This bit of ugliness is a *TEMPORARY* hack to preserve the
* over-the-wire protocols for named-pipe vnodes. It remaps the
* VFIFO type to the special over-the-wire type. (see note in nfs.h)
*
* BUYER BEWARE:
* If you are porting the NFS to a non-Sun server, you probably
* don't want to include the following block of code. The
* over-the-wire special file types will be changing with the
* NFS Protocol Revision.
*/
if (vap->va_type == VFIFO)
NA_SETFIFO(na);
return (0);
}
/*
* acl v2 support: returns approximate permission.
* default: returns minimal permission (more restrictive)
* aclok: returns maximal permission (less restrictive)
* This routine changes the permissions that are already in *va.
* If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
* CLASS_OBJ is always the same as GROUP_OBJ entry.
*/
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
vsecattr_t vsa;
int aclcnt;
aclent_t *aclentp;
mode_t mask_perm;
mode_t grp_perm;
mode_t other_perm;
mode_t other_orig;
int error;
/* We don't care about the default ACL. */
vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
if (!error) {
aclcnt = vsa.vsa_aclcnt;
if (aclcnt > MIN_ACL_ENTRIES) {
/* non-trivial ACL */
aclentp = vsa.vsa_aclentp;
if (exi->exi_export.ex_flags & EX_ACLOK) {
/* maximal permissions */
grp_perm = 0;
other_perm = 0;
for (; aclcnt > 0; aclcnt--, aclentp++) {
switch (aclentp->a_type) {
case USER_OBJ:
break;
case USER:
grp_perm |=
aclentp->a_perm << 3;
other_perm |= aclentp->a_perm;
break;
case GROUP_OBJ:
grp_perm |=
aclentp->a_perm << 3;
break;
case GROUP:
other_perm |= aclentp->a_perm;
break;
case OTHER_OBJ:
other_orig = aclentp->a_perm;
break;
case CLASS_OBJ:
mask_perm = aclentp->a_perm;
break;
default:
break;
}
}
grp_perm &= mask_perm << 3;
other_perm &= mask_perm;
other_perm |= other_orig;
} else {
/* minimal permissions */
grp_perm = 070;
other_perm = 07;
for (; aclcnt > 0; aclcnt--, aclentp++) {
switch (aclentp->a_type) {
case USER_OBJ:
break;
case USER:
case CLASS_OBJ:
grp_perm &=
aclentp->a_perm << 3;
other_perm &=
aclentp->a_perm;
break;
case GROUP_OBJ:
grp_perm &=
aclentp->a_perm << 3;
break;
case GROUP:
other_perm &=
aclentp->a_perm;
break;
case OTHER_OBJ:
other_perm &=
aclentp->a_perm;
break;
default:
break;
}
}
}
/* copy to va */
va->va_mode &= ~077;
va->va_mode |= grp_perm | other_perm;
}
if (vsa.vsa_aclcnt)
kmem_free(vsa.vsa_aclentp,
vsa.vsa_aclcnt * sizeof (aclent_t));
}
}
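/*
 * One-time initialization for the NFSv2 server.
 */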
void
rfs_srvrinit(void)
{
mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
nfs2_srv_caller_id = fs_new_caller_id();
}
void
rfs_srvrfini(void)
{
mutex_destroy(&rfs_async_write_lock);
}
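/*
 * Set up the RDMA write chunk list in the read result so that the
 * rr_count bytes of read data can be transferred to the client
 * via RDMA.
 */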
static int
rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
{
struct clist *wcl;
int wlist_len;
uint32_t count = rr->rr_count;
wcl = ra->ra_wlist;
if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
return (FALSE);
}
wcl = ra->ra_wlist;
rr->rr_ok.rrok_wlist_len = wlist_len;
rr->rr_ok.rrok_wlist = wcl;
return (TRUE);
}