fs/nfs/nfs4_client_state.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */


#include <nfs/nfs4_clnt.h>
#include <nfs/rnode4.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

/*
 * Forward declarations for file-local (static) helpers that are called
 * before their definitions appear in this file.
 */
static void nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
                nfs4_oo_hash_bucket_t *, mntinfo4_t *);
static open_delegation_type4 get_dtype(rnode4_t *);

#ifdef DEBUG
/*
 * Debug-only tunables.
 * NOTE(review): these two are not referenced in the part of this file
 * reviewed here; presumably they are consulted elsewhere -- confirm before
 * changing or removing them.
 */
int nfs4_client_foo_debug = 0x0;
int nfs4_client_open_dg = 0x0;
/*
 * If this is non-zero, the lockowner and openowner seqid sync primitives
 * will intermittently return errors.  For the open seqid sync this means
 * nfs4_start_open_seqid_sync() returns EAGAIN on every fifth call made by
 * a thread other than the mount's recovery thread.
 */
static int seqid_sync_faults = 0;
#endif

/*
 * Special stateid whose seqid and twelve "other" bytes are all zero.
 * NOTE(review): this looks like the all-zeros special stateid of the
 * NFSv4 protocol -- confirm against the protocol specification.
 */
stateid4 clnt_special0 = {
    0,
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};

/*
 * Special stateid whose seqid and twelve "other" bytes have every bit set.
 * NOTE(review): this looks like the all-ones special stateid of the
 * NFSv4 protocol -- confirm against the protocol specification.
 */
stateid4 clnt_special1 = {
    0xffffffff,
    {
        (char)0xff, (char)0xff, (char)0xff, (char)0xff,
        (char)0xff, (char)0xff, (char)0xff, (char)0xff,
        (char)0xff, (char)0xff, (char)0xff, (char)0xff
    }
};

/*
 * Select the open owner hash bucket for 'cr' within 'mi' and return it
 * with its b_lock held.  The bucket index is the sum of crgetuid() and
 * crgetruid() for the cred, taken modulo NFS4_NUM_OO_BUCKETS.
 *
 * The caller must hold mi->mi_lock (asserted) and is expected to drop the
 * bucket lock with unlock_bucket().
 */
static nfs4_oo_hash_bucket_t *
lock_bucket(cred_t *cr, mntinfo4_t *mi)
{
    uint32_t hash_key;
    nfs4_oo_hash_bucket_t *bp;

    hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr));
    hash_key %= NFS4_NUM_OO_BUCKETS;
    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
        "hash_key %d for cred %p", hash_key, (void*)cr));

    ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
    ASSERT(mi != NULL);
    ASSERT(mutex_owned(&mi->mi_lock));

    bp = &mi->mi_oo_list[hash_key];
    mutex_enter(&bp->b_lock);

    return (bp);
}

/*
 * Drop the b_lock of the hash bucket 'bucketp'; the counterpart of
 * lock_bucket().
 */
static void
unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
{
    mutex_exit(&bucketp->b_lock);
}

/*
 * Unlink 'lop' from the lock owner list of 'rp' and drop the reference
 * that the list held on it.  If 'lop' is already off the list (its next
 * pointer is NULL) nothing is done.
 *
 * rp->r_statev4_lock is taken here to protect the list and is released
 * before the reference is dropped.
 */
void
nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
{
    nfs4_lock_owner_t *before, *after;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "nfs4_rnode_remove_lock_owner"));

    mutex_enter(&rp->r_statev4_lock);

    after = lop->lo_next_rnode;
    if (after == NULL) {
        /* someone else already took it off the list */
        mutex_exit(&rp->r_statev4_lock);
        return;
    }

    ASSERT(lop->lo_prev_rnode != NULL);
    before = lop->lo_prev_rnode;

    /* splice the neighbours together, then detach lop */
    before->lo_next_rnode = after;
    after->lo_prev_rnode = before;
    lop->lo_next_rnode = NULL;
    lop->lo_prev_rnode = NULL;

    mutex_exit(&rp->r_statev4_lock);

    /*
     * A RELEASE_LOCKOWNER could be issued at this point.  It is not,
     * because in the common case close is going to release any
     * lockowners anyway.
     */
    lock_owner_rele(lop);
}

/*
 * Empty the lock owner list of 'rp'.  Each lock owner is unlinked from the
 * list and the list's reference on it is dropped.  The whole operation
 * runs with rp->r_statev4_lock held.
 */
void
nfs4_flush_lock_owners(rnode4_t *rp)
{
    nfs4_lock_owner_t *head = &rp->r_lo_head;
    nfs4_lock_owner_t *lop;

    mutex_enter(&rp->r_statev4_lock);
    /* keep peeling off the first entry until only the head remains */
    for (lop = head->lo_next_rnode; lop != head;
        lop = head->lo_next_rnode) {
        lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
        lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
        lop->lo_next_rnode = NULL;
        lop->lo_prev_rnode = NULL;
        lock_owner_rele(lop);
    }
    mutex_exit(&rp->r_statev4_lock);
}

/*
 * Tear down every open stream on the open streams list of 'rp'.  For each
 * stream the hold on its open owner is dropped, the stream is unlinked,
 * its os_sync_lock is destroyed and its memory is freed.  Stream reference
 * counts are not consulted.  Runs with rp->r_os_lock held throughout.
 */
void
nfs4_clear_open_streams(rnode4_t *rp)
{
    nfs4_open_stream_t *osp;

    mutex_enter(&rp->r_os_lock);
    for (;;) {
        osp = list_head(&rp->r_open_streams);
        if (osp == NULL)
            break;

        open_owner_rele(osp->os_open_owner);
        list_remove(&rp->r_open_streams, osp);
        mutex_destroy(&osp->os_sync_lock);
        osp->os_open_owner = NULL;
        kmem_free(osp, sizeof (*osp));
    }
    mutex_exit(&rp->r_os_lock);
}

/*
 * Take an additional reference on the open owner 'oop'.  The count is
 * protected by oo_lock.
 */
void
open_owner_hold(nfs4_open_owner_t *oop)
{
    mutex_enter(&oop->oo_lock);
    oop->oo_ref_count += 1;
    mutex_exit(&oop->oo_lock);
}

/*
 * Drop a reference on the open owner 'oop'.
 *
 * When the count reaches zero the open owner is only marked invalid
 * (oo_valid = 0).  It is neither destroyed nor moved to the mntinfo4's
 * free list here: that is done lazily by callers of find_open_owner(),
 * which keeps the locking simple.
 *
 * Releasing an open owner that has no references is a caller bug and is
 * caught by an assertion, matching open_stream_rele().
 */
void
open_owner_rele(nfs4_open_owner_t *oop)
{
    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "open_owner_rele"));

    mutex_enter(&oop->oo_lock);
    ASSERT(oop->oo_ref_count > 0);
    oop->oo_ref_count--;
    if (oop->oo_ref_count == 0) {
        NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
            "open_owner_rele: freeing open owner"));
        /* lazy free: just flag it, see the block comment above */
        oop->oo_valid = 0;
    }
    mutex_exit(&oop->oo_lock);
}

/*
 * Take an additional reference on the open stream 'osp'.  The count is
 * protected by os_sync_lock, which therefore must not already be held by
 * the calling thread.
 */
void
open_stream_hold(nfs4_open_stream_t *osp)
{
    mutex_enter(&osp->os_sync_lock);
    osp->os_ref_count += 1;
    mutex_exit(&osp->os_sync_lock);
}

/*
 * Drop a reference on the open stream 'osp', which belongs to 'rp'.
 *
 * When the last reference goes away the stream is marked invalid, its hold
 * on the open owner is released, it is unlinked from rp's open streams
 * list and its memory is freed.
 *
 * The caller must not hold rp->r_os_lock (asserted); it is taken here to
 * unlink the stream.
 */
void
open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
{
    nfs4_open_owner_t *owner;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "open_stream_rele"));

    ASSERT(!mutex_owned(&rp->r_os_lock));

    mutex_enter(&osp->os_sync_lock);
    ASSERT(osp->os_ref_count > 0);
    if (--osp->os_ref_count != 0) {
        /* other holders remain, nothing more to do */
        mutex_exit(&osp->os_sync_lock);
        return;
    }

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "open_stream_rele: freeing open stream"));
    osp->os_valid = 0;
    owner = osp->os_open_owner;
    mutex_exit(&osp->os_sync_lock);

    /* give back the stream's hold on its open owner */
    open_owner_rele(owner);

    mutex_enter(&rp->r_os_lock);
    list_remove(&rp->r_open_streams, osp);
    mutex_exit(&rp->r_os_lock);

    /* the stream is unreachable now; dispose of it */
    mutex_destroy(&osp->os_sync_lock);
    osp->os_open_owner = NULL;
    kmem_free(osp, sizeof (*osp));
}

/*
 * Take an additional reference on the lock owner 'lop'.  The count is
 * protected by lo_lock.
 */
void
lock_owner_hold(nfs4_lock_owner_t *lop)
{
    mutex_enter(&lop->lo_lock);
    lop->lo_ref_count += 1;
    mutex_exit(&lop->lo_lock);
}

/*
 * Drop a reference on the lock owner 'lop'.
 *
 * When the count reaches zero the lock owner is marked invalid, its
 * condition variable and mutex are destroyed, and its memory is freed.
 * By then it must already be off the rnode's lock owner list and nobody
 * may be using its seqid; both conditions are asserted.
 *
 * Releasing a lock owner that has no references is a caller bug and is
 * caught by an assertion, matching open_stream_rele().
 */
void
lock_owner_rele(nfs4_lock_owner_t *lop)
{
    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "lock_owner_rele"));

    mutex_enter(&lop->lo_lock);
    ASSERT(lop->lo_ref_count > 0);
    lop->lo_ref_count--;
    if (lop->lo_ref_count != 0) {
        /* other holders remain */
        mutex_exit(&lop->lo_lock);
        return;
    }

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "lock_owner_rele: freeing lock owner: "
        "%x", lop->lo_pid));
    lop->lo_valid = 0;
    /*
     * If there are no references, the lock_owner should
     * already be off the rnode's list.
     */
    ASSERT(lop->lo_next_rnode == NULL);
    ASSERT(lop->lo_prev_rnode == NULL);
    ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
    ASSERT(lop->lo_seqid_holder == NULL);
    mutex_exit(&lop->lo_lock);

    /* free up lop */
    cv_destroy(&lop->lo_cv_seqid_sync);
    mutex_destroy(&lop->lo_lock);
    kmem_free(lop, sizeof (*lop));
}

/*
 * Look up the open owner for 'cr' on mount 'mi'.  The caller must hold
 * mi->mi_lock (asserted).
 *
 * An open owner in the cred's hash bucket matches when its cred compares
 * equal to 'cr' and either 'just_created' is NFS4_JUST_CREATED (any value
 * of oo_just_created is accepted) or its oo_just_created equals
 * 'just_created'.  A match is returned with its reference count bumped; a
 * matching open owner that had been marked invalid is made valid again.
 *
 * While scanning, every non-matching open owner that is marked invalid is
 * taken off the bucket list and passed to nfs4_free_open_owner().  This is
 * the lazy cleanup that open_owner_rele() defers.
 *
 * If the bucket holds no match, the result of find_freed_open_owner() is
 * returned.
 */
nfs4_open_owner_t *
find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
{
    nfs4_oo_hash_bucket_t   *bucketp;
    nfs4_open_owner_t   *oop, *following;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "find_open_owner: cred %p, just_created %d",
        (void*)cr, just_created));

    ASSERT(mi != NULL);
    ASSERT(mutex_owned(&mi->mi_lock));

    bucketp = lock_bucket(cr, mi);

    /* walk the bucket; entries may be unlinked as we go */
    oop = list_head(&bucketp->b_oo_hash_list);
    while (oop != NULL) {
        mutex_enter(&oop->oo_lock);
        if (crcmp(oop->oo_cred, cr) == 0 &&
            (just_created == NFS4_JUST_CREATED ||
            oop->oo_just_created == just_created)) {
            if (oop->oo_valid == 0) {
                /* bring a lazily-released owner back to life */
                oop->oo_valid = 1;
                ASSERT(oop->oo_ref_count == 0);
            }
            oop->oo_ref_count++;
            mutex_exit(&oop->oo_lock);
            unlock_bucket(bucketp);
            return (oop);
        }

        /* fetch the successor before oop can leave the list */
        following = list_next(&bucketp->b_oo_hash_list, oop);
        if (oop->oo_valid == 0) {
            list_remove(&bucketp->b_oo_hash_list, oop);

            /*
             * Lazy release: move this dead open owner to the
             * freed list now that we have stumbled over it.
             */
            nfs4_free_open_owner(oop, mi);
        }
        mutex_exit(&oop->oo_lock);

        oop = following;
    }

    /* nothing live in the bucket; try the recently freed open owners */
    oop = find_freed_open_owner(cr, bucketp, mi);

    unlock_bucket(bucketp);

    return (oop);
}

/*
 * Same as find_open_owner_nolock(), but acquires and releases mi->mi_lock
 * around the lookup.  The caller must therefore not hold mi->mi_lock.
 */
nfs4_open_owner_t *
find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
{
    nfs4_open_owner_t *found;

    mutex_enter(&mi->mi_lock);
    found = find_open_owner_nolock(cr, just_created, mi);
    mutex_exit(&mi->mi_lock);

    return (found);
}

/*
 * Search the open streams list of 'rp' for a valid open stream that
 * belongs to open owner 'oop'.
 *
 * On a match the stream's reference count is bumped and the stream is
 * returned with its 'os_sync_lock' held.  Returns NULL, with no locks
 * held, when there is no match.
 */
nfs4_open_stream_t *
find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
{
    nfs4_open_stream_t  *osp;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "find_open_stream"));

    /* r_os_lock keeps the list stable while we walk it */
    mutex_enter(&rp->r_os_lock);
    for (osp = list_head(&rp->r_open_streams); osp != NULL;
        osp = list_next(&rp->r_open_streams, osp)) {
        mutex_enter(&osp->os_sync_lock);
        if (osp->os_open_owner != oop || osp->os_valid == 0) {
            /* wrong owner or already on its way out */
            mutex_exit(&osp->os_sync_lock);
            continue;
        }

        NFS4_DEBUG(nfs4_client_state_debug,
            (CE_NOTE, "find_open_stream "
            "got a match"));

        /* hand it back referenced and still locked */
        osp->os_ref_count++;
        mutex_exit(&rp->r_os_lock);
        return (osp);
    }
    mutex_exit(&rp->r_os_lock);

    return (NULL);
}

/*
 * Find the lock owner on 'rp' for process ID 'pid'.
 *
 * A lock owner is a candidate when its pid matches, it is valid, and it
 * is not flagged NFS4_BAD_SEQID_LOCK.  With 'which' == LOWN_ANY any
 * candidate is accepted.  With 'which' == LOWN_VALID_STATEID the candidate
 * must also no longer be in the NFS4_JUST_CREATED state, i.e. it must
 * contain a valid stateid from the server.
 *
 * A match is returned with its reference count bumped.  Returns NULL if
 * there was no match.
 */
nfs4_lock_owner_t *
find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
{
    nfs4_lock_owner_t   *head = &rp->r_lo_head;
    nfs4_lock_owner_t   *lop, *following;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "find_lock_owner: pid %x, which %d", pid, which));

    ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);

    mutex_enter(&rp->r_statev4_lock);

    for (lop = head->lo_next_rnode; lop != head; lop = following) {
        mutex_enter(&lop->lo_lock);
        if (lop->lo_pid == pid && lop->lo_valid != 0 &&
            !(lop->lo_flags & NFS4_BAD_SEQID_LOCK) &&
            (which == LOWN_ANY ||
            lop->lo_just_created != NFS4_JUST_CREATED)) {
            NFS4_DEBUG(nfs4_client_state_debug,
                (CE_NOTE, "find_lock_owner: "
                "got a match"));

            lop->lo_ref_count++;
            mutex_exit(&lop->lo_lock);
            mutex_exit(&rp->r_statev4_lock);
            return (lop);
        }
        following = lop->lo_next_rnode;
        mutex_exit(&lop->lo_lock);
    }

    mutex_exit(&rp->r_statev4_lock);

    return (NULL);
}

/*
 * Copy the delegation stateid of 'rp' into 'sid' if it may be used for
 * operation 'op'.  Returns 1 if 'sid' was filled in, otherwise 0.
 *
 * The delegation stateid is usable only when no delegation return is
 * pending and
 *  - 'op' is OP_WRITE and a write delegation is held, or
 *  - 'op' is anything else and some delegation (not OPEN_DELEGATE_NONE)
 *    is held.
 *
 * The caller must not hold rp->r_statev4_lock (asserted).
 */
static int
nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
{
    int usable = 0;

    ASSERT(!mutex_owned(&rp->r_statev4_lock));

    mutex_enter(&rp->r_statev4_lock);
    if (!rp->r_deleg_return_pending) {
        if (op == OP_WRITE)
            usable = (rp->r_deleg_type == OPEN_DELEGATE_WRITE);
        else
            usable = (rp->r_deleg_type != OPEN_DELEGATE_NONE);
    }
    if (usable)
        *sid = rp->r_deleg_stateid;
    mutex_exit(&rp->r_statev4_lock);

    return (usable);
}

/*
 * Copy the lock stateid held by process 'pid' on 'rp' into 'sid'.
 * Returns 1 if a lock owner with a valid stateid was found and 'sid' was
 * filled in, otherwise 0.
 */
static int
nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
{
    nfs4_lock_owner_t *lop;

    lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
    if (lop == NULL) {
        NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
            "nfs4_get_lock_stateid: no lop"));
        return (0);
    }

    /*
     * There is a matching lock owner, so its lock stateid is
     * used in preference to an open stateid.
     */
    mutex_enter(&lop->lo_lock);
    *sid = lop->lock_stateid;
    mutex_exit(&lop->lo_lock);

    /* drop the hold taken by find_lock_owner() */
    lock_owner_rele(lop);

    return (1);
}

/*
 * Copy the open stateid for <cr, rp> on mount 'mi' into 'sid'.  Returns 1
 * if 'sid' was filled in.  Returns 0 if there is no permanently created
 * open owner for 'cr', no open stream for that owner on 'rp', or the open
 * stream has failed a reopen.
 *
 * Once the stateid is returned to the caller, it is no longer protected;
 * so the caller must be prepared to handle OLD/BAD_STATEID where
 * appropriate.
 */
static int
nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
{
    nfs4_open_owner_t *oop;
    nfs4_open_stream_t *osp;
    int found;

    ASSERT(mi != NULL);

    oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
    if (oop == NULL) {
        NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
            "nfs4_get_open_stateid: no oop"));
        return (0);
    }

    /* on success the stream comes back with os_sync_lock held */
    osp = find_open_stream(oop, rp);
    open_owner_rele(oop);
    if (osp == NULL) {
        NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
            "nfs4_get_open_stateid: no osp"));
        return (0);
    }

    if (osp->os_failed_reopen) {
        NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
            "nfs4_get_open_stateid: osp %p failed reopen",
            (void *)osp));
        found = 0;
    } else {
        *sid = osp->open_stateid;
        found = 1;
    }
    mutex_exit(&osp->os_sync_lock);
    open_stream_rele(osp, rp);

    return (found);
}

/*
 * Pick the stateid to use for a WRITE-type operation, in priority order:
 *
 *  1. the delegation stateid, if nfs4_get_deleg_stateid() allows it for
 *     'op';
 *  2. the lock stateid, if 'pid' has a lock owner on 'rp';
 *  3. the open stateid, if 'cr' has an open stream on 'rp';
 *  4. the special stateid <seqid = 0, other = 0>.
 *
 * A candidate is passed over when stateid4_cmp() reports non-zero against
 * the corresponding stateid remembered in 'sid_tp' (d_sid, l_sid, o_sid).
 * 'sid_tp->cur_sid_type' is set to the type of the stateid returned.
 *
 * Used for WRITE operations.
 */
stateid4
nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
{
    stateid4 sid;

    if (nfs4_get_deleg_stateid(rp, op, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->d_sid)) {
        sid_tp->cur_sid_type = DEL_SID;
        return (sid);
    }

    if (nfs4_get_lock_stateid(rp, pid, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->l_sid)) {
        sid_tp->cur_sid_type = LOCK_SID;
        return (sid);
    }

    if (nfs4_get_open_stateid(rp, cr, mi, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->o_sid)) {
        sid_tp->cur_sid_type = OPEN_SID;
        return (sid);
    }

    /* nothing better is available: fall back to the special stateid */
    bzero(&sid, sizeof (stateid4));
    sid_tp->cur_sid_type = SPEC_SID;
    return (sid);
}

/*
 * Pick the stateid to use for a READ or SETATTR operation, in priority
 * order:
 *
 *  1. the delegation stateid, if nfs4_get_deleg_stateid() allows it for
 *     'op';
 *  2. the lock stateid, if 'pid' has a lock owner on 'rp';
 *  3. the open stateid, if 'cr' has an open stream on 'rp';
 *  4. the special stateid <seqid = 0, other = 0>.
 *
 * A candidate is passed over when stateid4_cmp() reports non-zero against
 * the corresponding stateid remembered in 'sid_tp' (d_sid, l_sid, o_sid),
 * which is how previously attempted stateids are skipped.
 * 'sid_tp->cur_sid_type' is set to the type of the stateid returned.
 *
 * For asynchronous READs ('async_read' set) the search does not restart
 * from the top: it resumes one level below 'sid_tp->cur_sid_type'.  A
 * current type of SPEC_SID, or any unknown type, panics the system.
 *
 * Used for READ and SETATTR operations.
 */
stateid4
nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
{
    stateid4 sid;
    /* first level to try: 0 delegation, 1 lock, 2 open, 3 special */
    int first_level = 0;

    if (async_read) {
        switch (sid_tp->cur_sid_type) {
        case NO_SID:
            break;
        case DEL_SID:
            first_level = 1;
            break;
        case LOCK_SID:
            first_level = 2;
            break;
        case OPEN_SID:
            first_level = 3;
            break;
        case SPEC_SID:
        default:
            cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
                "stateid type %d", sid_tp->cur_sid_type);
        }
    }

    if (first_level < 1 && nfs4_get_deleg_stateid(rp, op, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->d_sid)) {
        sid_tp->cur_sid_type = DEL_SID;
        return (sid);
    }

    if (first_level < 2 && nfs4_get_lock_stateid(rp, pid, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->l_sid)) {
        sid_tp->cur_sid_type = LOCK_SID;
        return (sid);
    }

    if (first_level < 3 && nfs4_get_open_stateid(rp, cr, mi, &sid) &&
        !stateid4_cmp(&sid, &sid_tp->o_sid)) {
        sid_tp->cur_sid_type = OPEN_SID;
        return (sid);
    }

    /* nothing better is available: fall back to the special stateid */
    bzero(&sid, sizeof (stateid4));
    sid_tp->cur_sid_type = SPEC_SID;
    return (sid);
}

/*
 * Store 'stateid' as the lock stateid of 'lop', under lo_lock.  The caller
 * must already hold the lock owner's seqid sync: NFS4_LOCK_SEQID_INUSE is
 * asserted to be set.
 */
void
nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
{
    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "nfs4_set_lock_stateid"));

    ASSERT(lop);
    ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);

    mutex_enter(&lop->lo_lock);
    lop->lock_stateid = stateid;
    mutex_exit(&lop->lo_lock);
}

/*
 * Counter from which new open owner names are drawn.  A fresh number is
 * used for every new open owner so that the server is not confused: open
 * owners are keyed by cred, a cred can be re-used quickly, and the server
 * may not have released all state for that cred yet.
 */
static uint64_t open_owner_seq_num = 0;

/*
 * Return the next open owner name: the counter above, atomically
 * incremented, so the first name handed out is 1.
 */
uint64_t
nfs4_get_new_oo_name(void)
{
    uint64_t name;

    name = atomic_inc_64_nv(&open_owner_seq_num);
    return (name);
}

/*
 * Allocate and initialize a new open owner for 'cr' and insert it at the
 * head of the cred's bucket in mi's open owner hash table.
 *
 * The new open owner holds its own reference on 'cr', starts out valid
 * and in the NFS4_JUST_CREATED state, and is returned with a reference
 * count of 1.  The caller must hold mi->mi_lock (asserted).
 */
nfs4_open_owner_t *
create_open_owner(cred_t *cr, mntinfo4_t *mi)
{
    nfs4_oo_hash_bucket_t   *bucketp;
    nfs4_open_owner_t   *oop;

    oop = kmem_alloc(sizeof (*oop), KM_SLEEP);

    /*
     * Hold the cred so it cannot go away once this open owner ends up
     * on the free list, and so that crcmp() against it stays valid.
     */
    crhold(cr);
    oop->oo_cred = cr;
    oop->oo_cred_otw = NULL;

    mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);

    oop->oo_valid = 1;
    oop->oo_ref_count = 1;
    oop->oo_just_created = NFS4_JUST_CREATED;

    oop->oo_seqid = 0;
    oop->oo_seqid_inuse = 0;
    oop->oo_last_good_seqid = 0;
    oop->oo_last_good_op = TAG_NONE;

    /* a Solaris open_owner is named by <oo_seq_num> */
    oop->oo_name = nfs4_get_new_oo_name();

    /* publish the new open owner in the cred hash table */
    ASSERT(mutex_owned(&mi->mi_lock));
    bucketp = lock_bucket(cr, mi);
    list_insert_head(&bucketp->b_oo_hash_list, oop);
    unlock_bucket(bucketp);

    return (oop);
}

/*
 * Create a new open stream for <oop, rp> and add it to the head of the
 * rnode's open streams list.
 *
 * Takes a reference on 'oop' on behalf of the stream.  The stream starts
 * with os_open_ref_count of 1 and os_ref_count of 2 (presumably one for
 * the list and one for the caller -- confirm against callers), a zeroed
 * open stateid and all share/flag fields cleared.
 *
 * Returns with 'os_sync_lock' held.  In DEBUG kernels the open owner's
 * seqid is asserted to be in use by the caller.
 */
nfs4_open_stream_t *
create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
{
    nfs4_open_stream_t  *osp;

#ifdef DEBUG
    mutex_enter(&oop->oo_lock);
    ASSERT(oop->oo_seqid_inuse);
    mutex_exit(&oop->oo_lock);
#endif

    osp = kmem_alloc(sizeof (*osp), KM_SLEEP);

    /* identity and reference counts */
    osp->os_open_owner = oop;
    osp->os_orig_oo_name = oop->oo_name;
    osp->os_valid = 1;
    osp->os_ref_count = 2;
    osp->os_open_ref_count = 1;
    osp->os_mapcnt = 0;
    bzero(&osp->open_stateid, sizeof (stateid4));

    /* share access / deny accounting */
    osp->os_share_acc_read = 0;
    osp->os_share_acc_write = 0;
    osp->os_mmap_read = 0;
    osp->os_mmap_write = 0;
    osp->os_share_deny_none = 0;
    osp->os_share_deny_read = 0;
    osp->os_share_deny_write = 0;

    /* state flags */
    osp->os_delegation = 0;
    osp->os_dc_openacc = 0;
    osp->os_final_close = 0;
    osp->os_pending_close = 0;
    osp->os_failed_reopen = 0;
    osp->os_force_close = 0;

    mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);

    /* the stream keeps its open owner alive */
    open_owner_hold(oop);

    /* link the stream onto rp, leaving os_sync_lock held for the caller */
    mutex_enter(&rp->r_os_lock);
    mutex_enter(&osp->os_sync_lock);
    list_insert_head(&rp->r_open_streams, osp);
    mutex_exit(&rp->r_os_lock);

    return (osp);
}

/*
 * Return the open stream for <oop, rp>, creating it if it does not exist
 * yet.  The stream is returned with 'os_sync_lock' held.
 *
 * '*created_osp' is set to 1 when a new stream was created and to 0 when
 * an existing one was found; in the latter case the stream's
 * 'os_open_ref_count' is bumped.
 *
 * Two threads cannot race here and create two open streams for the same
 * <oop, rp> pair: the caller must hold the open seqid sync (asserted in
 * DEBUG kernels), which admits only one thread at a time.
 */
nfs4_open_stream_t *
find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
    int *created_osp)
{
    nfs4_open_stream_t *osp;

#ifdef DEBUG
    mutex_enter(&oop->oo_lock);
    ASSERT(oop->oo_seqid_inuse);
    mutex_exit(&oop->oo_lock);
#endif

    osp = find_open_stream(oop, rp);
    if (osp != NULL) {
        /* existing stream: account for one more open on it */
        *created_osp = 0;
        osp->os_open_ref_count++;
        return (osp);
    }

    osp = create_open_stream(oop, rp);
    if (osp != NULL)
        *created_osp = 1;

    return (osp);
}

/* Counter from which the <seq_num> part of lock owner names is drawn. */
static uint64_t lock_owner_seq_num = 0;

/*
 * Create a new lock owner for process 'pid' and insert it at the front of
 * the rnode's lock owner list.  The caller must hold rp->r_statev4_lock
 * (asserted).
 *
 * The new lock owner starts valid, in the NFS4_JUST_CREATED state, with a
 * zeroed lock stateid and a reference count of 2: one for the list and
 * one for the caller to use.  It is returned with its lo_lock held.
 */
nfs4_lock_owner_t *
create_lock_owner(rnode4_t *rp, pid_t pid)
{
    nfs4_lock_owner_t   *lop;
    nfs4_lock_owner_t   *head = &rp->r_lo_head;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "create_lock_owner: pid %x", pid));

    ASSERT(mutex_owned(&rp->r_statev4_lock));

    lop = kmem_alloc(sizeof (*lop), KM_SLEEP);

    lop->lo_valid = 1;
    lop->lo_ref_count = 2;
    lop->lo_pid = pid;
    lop->lo_just_created = NFS4_JUST_CREATED;
    lop->lo_flags = 0;
    lop->lo_pending_rqsts = 0;
    lop->lo_seqid_holder = NULL;
    lop->lock_seqid = 0;
    bzero(&lop->lock_stateid, sizeof (stateid4));

    /* a Solaris lock_owner is named by <seq_num><pid> */
    lop->lock_owner_name.ln_seq_num =
        atomic_inc_64_nv(&lock_owner_seq_num);
    lop->lock_owner_name.ln_pid = pid;

    cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
    mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);

    /* the caller gets the lock owner back locked */
    mutex_enter(&lop->lo_lock);

    /* link it in right after the list head */
    lop->lo_prev_rnode = head;
    lop->lo_next_rnode = head->lo_next_rnode;
    head->lo_next_rnode->lo_prev_rnode = lop;
    head->lo_next_rnode = lop;

    return (lop);
}

/*
 * Record 'seqid' as the lock seqid of lock owner 'lop'.  The caller must
 * already hold the lock owner's seqid sync: NFS4_LOCK_SEQID_INUSE is
 * asserted to be set.  lo_lock is not taken here.
 */
void
nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
{
    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "nfs4_set_lock_seqid"));

    ASSERT(lop != NULL);
    ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);

    lop->lock_seqid = seqid;
}

/*
 * Fill in 'owner' with a freshly made-up lock owner name for 'pid'.  The
 * name buffer is allocated here with kmem_alloc(); 'owner' ends up owning
 * owner_len bytes at owner_val.
 */
static void
nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
{
    nfs4_lo_name_t *namep;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "nfs4_set_new_lock_owner_args"));

    owner->owner_len = sizeof (*namep);
    owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);

    /* a Solaris lock_owner is named by <seq_num><pid> */
    namep = (nfs4_lo_name_t *)owner->owner_val;
    namep->ln_seq_num = atomic_inc_64_nv(&lock_owner_seq_num);
    namep->ln_pid = pid;
}

/*
 * Fill in the lock owner args 'owner' for process 'pid' on 'rp'.
 *
 * If 'pid' already has a lock owner with a valid stateid on 'rp', a copy
 * of that lock owner's name is used.  Otherwise a new name is made up.  In
 * both cases the name buffer at owner->owner_val is allocated with
 * kmem_alloc().
 */
void
nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
{
    nfs4_lock_owner_t *lop;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "nfs4_setlockowner_args"));

    /* a successful lookup returns lop with an extra hold */
    lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
    if (lop == NULL) {
        nfs4_set_new_lock_owner_args(owner, pid);
        return;
    }

    /* copy the existing name out under lo_lock */
    mutex_enter(&lop->lo_lock);
    owner->owner_len = sizeof (lop->lock_owner_name);
    owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
    bcopy(&lop->lock_owner_name, owner->owner_val,
        owner->owner_len);
    mutex_exit(&lop->lo_lock);

    lock_owner_rele(lop);
}

/*
 * End our use of the open owner's open seqid: clear oo_seqid_inuse and
 * issue a cv_broadcast so that threads waiting to use the open seqid wake
 * up and re-check the flag.  The seqid must currently be in use
 * (asserted).
 */
void
nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
{
    mutex_enter(&oop->oo_lock);
    ASSERT(oop->oo_seqid_inuse);
    oop->oo_seqid_inuse = 0;
    cv_broadcast(&oop->oo_cv_seqid_sync);
    mutex_exit(&oop->oo_lock);
}

/*
 * Start our use of the open owner's open seqid by setting oo_seqid_inuse.
 * If another thread is using the seqid we wait (forever) in cv_wait()
 * until it is released.
 *
 * Recovery is checked twice: before waiting, and again after the seqid
 * has been acquired.  If recovery is active on 'mi' and the caller is not
 * the recovery thread, EAGAIN is returned; in the second case the seqid is
 * released again first.
 *
 * In DEBUG kernels with seqid_sync_faults set, every fifth call from a
 * non-recovery thread fails with EAGAIN before anything else is done.
 *
 * Return values:
 * 0        no problems
 * EAGAIN   caller should retry (like a recovery retry)
 */
int
nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
{
    int error = 0;
#ifdef DEBUG
    static int ops = 0;     /* fault injection */

    if (seqid_sync_faults && curthread != mi->mi_recovthread &&
        ++ops % 5 == 0)
        return (EAGAIN);
#endif

    /* do not even queue up if recovery is running and we are not it */
    mutex_enter(&mi->mi_lock);
    if (curthread != mi->mi_recovthread &&
        (mi->mi_flags & MI4_RECOV_ACTIV))
        error = EAGAIN;
    mutex_exit(&mi->mi_lock);
    if (error != 0)
        return (error);

    mutex_enter(&oop->oo_lock);
    while (oop->oo_seqid_inuse) {
        NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
            "nfs4_start_open_seqid_sync waiting on cv"));

        cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
    }
    oop->oo_seqid_inuse = 1;
    mutex_exit(&oop->oo_lock);

    /* recovery may have started while we were waiting; look again */
    mutex_enter(&mi->mi_lock);
    if (curthread != mi->mi_recovthread &&
        (mi->mi_flags & MI4_RECOV_ACTIV))
        error = EAGAIN;
    mutex_exit(&mi->mi_lock);

    if (error == EAGAIN)
        nfs4_end_open_seqid_sync(oop);

    NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
        "nfs4_start_open_seqid_sync: error=%d", error));

    return (error);
}

#ifdef  DEBUG
/*
 * Debug counters of OTW OPENs avoided by nfs4_is_otw_open_necessary():
 * [0] counts bypasses satisfied by an existing open stream, [1] counts
 * bypasses satisfied by a held delegation.  They are incremented without
 * any dedicated lock.
 */
int bypass_otw[2];
#endif

/*
 * Checks to see if the OPEN OTW is necessary, that is, whether the file
 * has already been opened with the access bits we are now asking for, or
 * whether a held delegation lets the client satisfy the open locally.
 *
 * Returns 1 if the caller must go over the wire.  Returns 0, with *errorp
 * set to 0, if the OTW OPEN can be skipped.  *errorp is not written on the
 * paths that return 1.
 *
 * The OPEN can be skipped in two cases:
 *
 *  - An open stream for <oop, rp> exists, has not failed a reopen, and for
 *    each of FREAD / FWRITE present in 'flag' either the stream's matching
 *    share access count is non-zero or the delegation covers it (any
 *    delegation for FREAD, a write delegation for FWRITE).  Unless
 *    'just_been_created' is set, the stream's open reference count and
 *    share counts are bumped.
 *
 *  - No open stream exists, but a delegation is held that covers 'flag'
 *    and 'acc' grants the matching VREAD / VWRITE bits.  A new open stream
 *    is created with the delegation stateid as its open stateid, the open
 *    owner is marked NFS4_PERM_CREATED, and the state reference count of
 *    rsp->rs_sp is bumped when rs_sp is non-NULL.
 *
 * The caller must hold rp->r_deleg_recall_lock as reader (asserted) and
 * the open owner's seqid sync.
 *
 * Note, this assumes that vp is backed by an rnode4 (it is converted with
 * VTOR4()).
 */
int
nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
    int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
{
    rnode4_t *rp;
    nfs4_open_stream_t *osp;
    open_delegation_type4 dt;

    rp = VTOR4(vp);

    /*
     * Grab the delegation type.  This function is protected against
     * the delegation being returned by virtue of start_op (called
     * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
     * delegreturn requires this lock in write mode to proceed.
     */
    ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
    dt = get_dtype(rp);

    /* returns with 'os_sync_lock' held */
    osp = find_open_stream(oop, rp);

    if (osp) {
        uint32_t    do_otw = 0;

        /* a stream whose reopen failed can never be reused as-is */
        if (osp->os_failed_reopen) {
            NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
                "nfs4_is_otw_open_necessary: os_failed_reopen "
                "set on osp %p, cr %p, rp %s", (void *)osp,
                (void *)osp->os_open_owner->oo_cred,
                rnode4info(rp)));
            do_otw = 1;
        }

        /*
         * check access/deny bits
         */
        /* read: need an existing read share or any delegation */
        if (!do_otw && (flag & FREAD))
            if (osp->os_share_acc_read == 0 &&
                dt == OPEN_DELEGATE_NONE)
                do_otw = 1;

        /* write: need an existing write share or a write delegation */
        if (!do_otw && (flag & FWRITE))
            if (osp->os_share_acc_write == 0 &&
                dt != OPEN_DELEGATE_WRITE)
                do_otw = 1;

        if (!do_otw) {
            NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
                "nfs4_is_otw_open_necessary: can skip this "
                "open OTW"));
            /* account for this open on the existing stream */
            if (!just_been_created) {
                osp->os_open_ref_count++;
                if (flag & FREAD)
                    osp->os_share_acc_read++;
                if (flag & FWRITE)
                    osp->os_share_acc_write++;
                osp->os_share_deny_none++;
            }

            /*
             * Need to reset this bitfield for the possible case
             * where we were going to OTW CLOSE the file, got a
             * non-recoverable error, and before we could retry
             * the CLOSE, OPENed the file again.
             */
            ASSERT(osp->os_open_owner->oo_seqid_inuse);
            osp->os_final_close = 0;
            osp->os_force_close = 0;

            /* drop the lock and hold taken by find_open_stream() */
            mutex_exit(&osp->os_sync_lock);
            open_stream_rele(osp, rp);

#ifdef  DEBUG
            bypass_otw[0]++;
#endif

            *errorp = 0;
            return (0);
        }
        /* must go OTW: drop the lock and hold from find_open_stream() */
        mutex_exit(&osp->os_sync_lock);
        open_stream_rele(osp, rp);

    } else if (dt != OPEN_DELEGATE_NONE) {
        /*
         * Even if there isn't an open_stream yet, we may still be
         * able to bypass the otw open if the client owns a delegation.
         *
         * If you are asking for WRITE, but I only have
         * a read delegation, then you still have to go otw.
         */

        if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
            return (1);

        /*
         * TODO - evaluate the nfsace4
         */

        /*
         * Check the access flags to make sure the caller
         * had permission.
         */
        if (flag & FREAD && !(acc & VREAD))
            return (1);

        if (flag & FWRITE && !(acc & VWRITE))
            return (1);

        /*
         * create_open_stream will add a reference to oop,
         * this will prevent the open_owner_rele done in
         * nfs4open_otw from destroying the open_owner.
         */

        /* returns with 'os_sync_lock' held */
        osp = create_open_stream(oop, rp);
        if (osp == NULL)
            return (1);

        /* the delegation stateid stands in for an open stateid */
        osp->open_stateid = rp->r_deleg_stateid;
        osp->os_delegation = 1;

        if (flag & FREAD)
            osp->os_share_acc_read++;
        if (flag & FWRITE)
            osp->os_share_acc_write++;

        osp->os_share_deny_none++;
        mutex_exit(&osp->os_sync_lock);

        /* drops one of the two references the new stream started with */
        open_stream_rele(osp, rp);

        /* the open owner now has real state attached to it */
        mutex_enter(&oop->oo_lock);
        oop->oo_just_created = NFS4_PERM_CREATED;
        mutex_exit(&oop->oo_lock);

        ASSERT(rsp != NULL);
        if (rsp->rs_sp != NULL) {
            mutex_enter(&rsp->rs_sp->s_lock);
            nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
                VTOMI4(vp));
            mutex_exit(&rsp->rs_sp->s_lock);
        }
#ifdef  DEBUG
        bypass_otw[1]++;
#endif

        *errorp = 0;
        return (0);
    }

    /* no way to satisfy the open locally */
    return (1);
}

/*
 * Return the delegation type recorded in 'rp', sampled under
 * r_statev4_lock.  If a delegation return is pending the rnode is
 * reported as having no delegation (OPEN_DELEGATE_NONE).
 */
static open_delegation_type4
get_dtype(rnode4_t *rp)
{
    open_delegation_type4 deleg_type;

    mutex_enter(&rp->r_statev4_lock);
    /* We must not be called while a delegation return is in progress. */
    ASSERT(!rp->r_deleg_return_inprog);
    deleg_type = rp->r_deleg_return_pending ?
        OPEN_DELEGATE_NONE : rp->r_deleg_type;
    mutex_exit(&rp->r_statev4_lock);

    return (deleg_type);
}

/*
 * Fill in *locker with the lock state arguments for a LOCK call.
 *
 * If lop->lo_just_created == NFS4_JUST_CREATED the open_to_lock_owner4
 * arm is filled in, which needs the open owner's next seqid and the open
 * stream's stateid, so oop and osp must be non-NULL.  The owner_val
 * buffer is allocated here with kmem_alloc(); this function does not
 * free it.  Otherwise the exist_lock_owner4 arm is filled in from the
 * lock owner alone and oop/osp are not looked at.
 *
 * Caller must already hold the necessary seqid sync lock(s).
 */

void
nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
{
    open_to_lock_owner4 *new_owner;
    exist_lock_owner4 *cur_owner;

    ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);

    if (lop->lo_just_created != NFS4_JUST_CREATED) {
        /* have an existing lock owner */
        cur_owner = &locker->locker4_u.lock_owner;
        locker->new_lock_owner = FALSE;

        /* the lock stateid is sampled under lo_lock */
        mutex_enter(&lop->lo_lock);
        cur_owner->lock_stateid = lop->lock_stateid;
        mutex_exit(&lop->lo_lock);

        cur_owner->lock_seqid = lop->lock_seqid + 1;
        return;
    }

    /* this is a new lock request */
    ASSERT(oop != NULL);
    ASSERT(osp != NULL);

    new_owner = &locker->locker4_u.open_owner;
    locker->new_lock_owner = TRUE;

    new_owner->open_seqid = nfs4_get_open_seqid(oop) + 1;

    /* the open stateid is sampled under os_sync_lock */
    mutex_enter(&osp->os_sync_lock);
    new_owner->open_stateid = osp->open_stateid;
    mutex_exit(&osp->os_sync_lock);

    /* initial lock seqid, so no +1 */
    new_owner->lock_seqid = lop->lock_seqid;

    /* hand the server a private copy of the lock owner name */
    new_owner->lock_owner.clientid = clientid;
    new_owner->lock_owner.owner_len = sizeof (lop->lock_owner_name);
    new_owner->lock_owner.owner_val =
        kmem_alloc(new_owner->lock_owner.owner_len, KM_SLEEP);
    bcopy(&lop->lock_owner_name, new_owner->lock_owner.owner_val,
        new_owner->lock_owner.owner_len);
}

/*
 * Start our use of the lock owner's lock seqid.  We wait (forever, with
 * cv_wait()) until no other thread has NFS4_LOCK_SEQID_INUSE set in
 * lo_flags, then set it ourselves and record curthread as the holder.
 *
 * Whether recovery is active on 'mi' is checked both before waiting and
 * again after the seqid has been taken.  A thread other than the recovery
 * thread backs off in either case (giving the seqid back if it already
 * took it) and returns EAGAIN.
 *
 * Return values:
 * 0        no problems
 * EAGAIN   caller should retry (like a recovery retry)
 */
int
nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
{
    int error = 0;
    int must_retry;
#ifdef DEBUG
    static int ops = 0;     /* fault injection */

    if (seqid_sync_faults && curthread != mi->mi_recovthread &&
        ++ops % 7 == 0)
        return (EAGAIN);
#endif

    /* Don't even queue up for the seqid while recovery is running. */
    mutex_enter(&mi->mi_lock);
    must_retry = (mi->mi_flags & MI4_RECOV_ACTIV) &&
        curthread != mi->mi_recovthread;
    mutex_exit(&mi->mi_lock);
    if (must_retry)
        return (EAGAIN);

    mutex_enter(&lop->lo_lock);

    ASSERT(lop->lo_seqid_holder != curthread);
    while ((lop->lo_flags & NFS4_LOCK_SEQID_INUSE) != 0) {
        NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
            "nfs4_start_lock_seqid_sync: waiting on cv"));

        cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
    }
    NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
        "NFS4_LOCK_SEQID_INUSE"));

    /* The seqid is ours now. */
    lop->lo_seqid_holder = curthread;
    lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
    mutex_exit(&lop->lo_lock);

    /* Recovery may have started while we were waiting; check again. */
    mutex_enter(&mi->mi_lock);
    must_retry = (mi->mi_flags & MI4_RECOV_ACTIV) &&
        curthread != mi->mi_recovthread;
    mutex_exit(&mi->mi_lock);

    if (must_retry) {
        error = EAGAIN;
        nfs4_end_lock_seqid_sync(lop);
    }

    NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
        "nfs4_start_lock_seqid_sync: error=%d", error));

    return (error);
}

/*
 * End our use of the lock owner's lock seqid: clear NFS4_LOCK_SEQID_INUSE
 * and the recorded holder, then cv_broadcast() so that threads waiting in
 * nfs4_start_lock_seqid_sync() can re-check the flag.
 *
 * Must be called by the thread that currently holds the seqid (asserted).
 */
void
nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
{
    mutex_enter(&lop->lo_lock);

    ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
    ASSERT(lop->lo_seqid_holder == curthread);

    lop->lo_seqid_holder = NULL;
    lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;

    cv_broadcast(&lop->lo_cv_seqid_sync);
    mutex_exit(&lop->lo_lock);
}

/*
 * Returns a reference to a lock owner via lopp, which has its lock seqid
 * synchronization started.
 * If the lock owner is in the 'just_created' state, then we return its open
 * owner and open stream and start the open seqid synchronization;
 * otherwise *oopp and *ospp are returned as NULL.
 *
 * On any failure *oopp and *ospp are NULL on return, the references taken
 * here have been released, and *lopp is not written.
 *
 * Return value:
 * NFS4_OK      no problems
 * NFS4ERR_DELAY    there is lost state to recover; caller should retry
 * NFS4ERR_IO       no open owner or open stream, or the open stream
 *          has os_failed_reopen set
 */
nfsstat4
nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp)
{
    nfs4_lock_owner_t *lop, *next_lop;
    mntinfo4_t *mi;
    int error = 0;
    nfsstat4 stat;

    /*
     * The failure paths below look at *oopp and *ospp to decide what to
     * release, and some of them are reached before this function has
     * stored anything in *ospp.  Start both out as NULL so that we never
     * act on whatever the caller happened to leave there.
     */
    *oopp = NULL;
    *ospp = NULL;

    mi = VTOMI4(RTOV4(rp));

    mutex_enter(&rp->r_statev4_lock);

    /*
     * Walk the rnode's lock owner list looking for a valid entry for
     * this pid.  On a match we leave the loop with lo_lock still held.
     */
    lop = rp->r_lo_head.lo_next_rnode;
    while (lop != &rp->r_lo_head) {
        mutex_enter(&lop->lo_lock);
        if (lop->lo_pid == pid && lop->lo_valid != 0) {
            /* Found a matching lock owner */
            NFS4_DEBUG(nfs4_client_state_debug,
                (CE_NOTE, "nfs4_find_or_create_lock_owner: "
                "got a match"));
            lop->lo_ref_count++;
            break;
        }
        next_lop = lop->lo_next_rnode;
        mutex_exit(&lop->lo_lock);
        lop = next_lop;
    }

    if (lop == &rp->r_lo_head) {
        /* create temporary lock owner */
        lop = create_lock_owner(rp, pid);
    }
    mutex_exit(&rp->r_statev4_lock);

    /* Have a locked down lock owner struct now */
    if (lop->lo_just_created != NFS4_JUST_CREATED) {
        /* This is an existing lock owner */
        *oopp = NULL;
        *ospp = NULL;
    } else {
        /* Lock owner doesn't exist yet */

        /* First grab open owner seqid synchronization */
        mutex_exit(&lop->lo_lock);
        *oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
        if (*oopp == NULL)
            goto kill_new_lop;
        error = nfs4_start_open_seqid_sync(*oopp, mi);
        if (error == EAGAIN) {
            stat = NFS4ERR_DELAY;
            goto failed;
        }
        /* on success the open stream comes back with os_sync_lock held */
        *ospp = find_open_stream(*oopp, rp);
        if (*ospp == NULL) {
            nfs4_end_open_seqid_sync(*oopp);
            goto kill_new_lop;
        }
        if ((*ospp)->os_failed_reopen) {
            mutex_exit(&(*ospp)->os_sync_lock);
            NFS4_DEBUG((nfs4_open_stream_debug ||
                nfs4_client_lock_debug), (CE_NOTE,
                "nfs4_find_or_create_lock_owner: os_failed_reopen;"
                "osp %p, cr %p, rp %s", (void *)(*ospp),
                (void *)cr, rnode4info(rp)));
            nfs4_end_open_seqid_sync(*oopp);
            stat = NFS4ERR_IO;
            goto failed;
        }
        mutex_exit(&(*ospp)->os_sync_lock);

        /*
         * Now see if the lock owner has become permanent while we
         * had released our lock.
         */
        mutex_enter(&lop->lo_lock);
        if (lop->lo_just_created != NFS4_JUST_CREATED) {
            /* no longer a new lock owner; open state isn't needed */
            nfs4_end_open_seqid_sync(*oopp);
            open_stream_rele(*ospp, rp);
            open_owner_rele(*oopp);
            *oopp = NULL;
            *ospp = NULL;
        }
    }
    mutex_exit(&lop->lo_lock);

    error = nfs4_start_lock_seqid_sync(lop, mi);
    if (error == EAGAIN) {
        if (*oopp != NULL)
            nfs4_end_open_seqid_sync(*oopp);
        stat = NFS4ERR_DELAY;
        goto failed;
    }
    ASSERT(error == 0);

    *lopp = lop;
    return (NFS4_OK);

kill_new_lop:
    /*
     * A previous CLOSE was attempted but got EINTR, but the application
     * continued to use the unspecified state file descriptor.  But now the
     * open stream is gone (which could also destroy the open owner), hence
     * we can no longer continue.  The calling function should return EIO
     * to the application.
     */
    NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
        (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
        "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
        (void *)(*ospp)));

    nfs4_rnode_remove_lock_owner(rp, lop);
    stat = NFS4ERR_IO;

failed:
    /* drop every reference we are still holding and clear the out-params */
    lock_owner_rele(lop);
    if (*oopp) {
        open_owner_rele(*oopp);
        *oopp = NULL;
    }
    if (*ospp) {
        open_stream_rele(*ospp, rp);
        *ospp = NULL;
    }
    return (stat);
}

/*
 * Look on the mntinfo4's list of recently freed open owners for one whose
 * cred matches 'cr'.  On a match the open owner is taken off the freed
 * list, its 'oo_ref_count' and 'oo_valid' are both reset to 1 (sane
 * values), it is inserted at the head of the hash bucket's list, and it
 * is returned.  Returns NULL if no freed open owner matches.
 *
 * Caller must hold both mi->mi_lock and bucketp->b_lock (asserted).
 */
static nfs4_open_owner_t *
find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
    mntinfo4_t *mi)
{
    nfs4_open_owner_t *cand;

    NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
        "find_freed_open_owner: cred %p", (void *)cr));

    ASSERT(mutex_owned(&mi->mi_lock));
    ASSERT(mutex_owned(&bucketp->b_lock));

    /* got hash bucket, search through freed open owners */
    for (cand = list_head(&mi->mi_foo_list); cand != NULL;
        cand = list_next(&mi->mi_foo_list, cand)) {
        /* crcmp() returns 0 for a matching cred */
        if (crcmp(cand->oo_cred, cr) != 0)
            continue;

        NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
            "find_freed_open_owner: got a match open owner "
            "%p", (void *)cand));

        /* take it off the freed list ... */
        list_remove(&mi->mi_foo_list, cand);
        mi->mi_foo_num--;

        /* ... bring it back to life ... */
        cand->oo_ref_count = 1;
        cand->oo_valid = 1;

        /* ... and add the struct into the cred hash table */
        list_insert_head(&bucketp->b_oo_hash_list, cand);
        return (cand);
    }

    return (NULL);
}

/*
 * Insert the newly freed 'oop' into the mi's freed oop list, always at
 * the head of the list.  While the list holds fewer than mi_foo_max
 * entries it simply grows by one.  Once the maximum has been reached the
 * LRU open owner (namely the tail) is removed and destroyed to make room,
 * so mi_foo_num stays the same.
 */
static void
nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
{
    nfs4_open_owner_t *victim;

    if (mi->mi_foo_num >= mi->mi_foo_max) {
        /*
         * need to replace a freed open owner
         *
         * NOTE(review): list_tail() on an empty list would give NULL
         * here (only possible if mi_foo_max could be 0) -- confirm
         * mi_foo_max is always positive.
         */
        victim = list_tail(&mi->mi_foo_list);

        NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
            "nfs4_free_open_owner: destroy %p, insert %p",
            (void *)victim, (void *)oop));

        list_remove(&mi->mi_foo_list, victim);
        nfs4_destroy_open_owner(victim);

        /* head always has latest freed oop */
        list_insert_head(&mi->mi_foo_list, oop);
        return;
    }

    /* still room on the list */
    NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
        "nfs4_free_open_owner: num free %d, max free %d, "
        "insert open owner %p for mntinfo4 %p",
        mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
        (void *)mi));
    list_insert_head(&mi->mi_foo_list, oop);
    mi->mi_foo_num++;
}

/*
 * Tear down an open owner: drop its hold on oo_cred (and on oo_cred_otw
 * if one is set), destroy its mutex and condition variable, and free the
 * structure itself.  'oop' must not be used after this returns.
 */
void
nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
{
    ASSERT(oop != NULL);

    /* release the credentials */
    crfree(oop->oo_cred);
    if (oop->oo_cred_otw != NULL)
        crfree(oop->oo_cred_otw);

    /* release the synchronization objects, then the memory */
    mutex_destroy(&oop->oo_lock);
    cv_destroy(&oop->oo_cv_seqid_sync);

    kmem_free(oop, sizeof (*oop));
}

/*
 * Return the open owner's current open seqid.  The open seqid must be
 * marked in use (asserted).
 */
seqid4
nfs4_get_open_seqid(nfs4_open_owner_t *oop)
{
    seqid4 current;

    ASSERT(oop->oo_seqid_inuse);
    current = oop->oo_seqid;

    return (current);
}

/*
 * This sets the open seqid for the open owner 'oop' to 'seqid', and
 * records that value and 'tag_type' as the last good seqid and operation.
 * The open seqid must be marked in use (asserted).
 */
void
nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
    nfs4_tag_type_t tag_type)
{
    ASSERT(oop->oo_seqid_inuse);

    oop->oo_last_good_op = tag_type;
    oop->oo_last_good_seqid = seqid;
    oop->oo_seqid = seqid;
}

/*
 * This bumps the current open seqid for the open owner 'oop' by one, and
 * records the new value and 'tag_type' as the last good seqid and
 * operation.  The open seqid must be marked in use (asserted).
 */
void
nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
    nfs4_tag_type_t tag_type)
{
    ASSERT(oop->oo_seqid_inuse);

    oop->oo_seqid += 1;
    oop->oo_last_good_op = tag_type;
    oop->oo_last_good_seqid = oop->oo_seqid;
}

/*
 * Pick the credential to use over the wire.
 *
 * If no open owner was provided, the cred 'cr' is used to look one up
 * within the given mntinfo4_t.  If an open owner is available and it has
 * an OTW credential (oo_cred_otw), that credential is returned; in every
 * other case the supplied 'cr' is returned.
 *
 * A hold is put on the returned credential, and it is up to the caller
 * to free the cred.
 */
cred_t *
nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
{
    nfs4_open_owner_t *oop;
    cred_t *chosen;

    oop = provided_oop;
    if (oop == NULL)
        oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);

    if (oop == NULL) {
        /* no open owner at all; fall back to the caller's cred */
        crhold(cr);
        return (cr);
    }

    /* the hold is taken while oo_lock is held */
    mutex_enter(&oop->oo_lock);
    chosen = oop->oo_cred_otw ? oop->oo_cred_otw : cr;
    crhold(chosen);
    mutex_exit(&oop->oo_lock);

    /* only release the open owner if we were the ones who looked it up */
    if (provided_oop == NULL)
        open_owner_rele(oop);

    return (chosen);
}

/*
 * Retrieves the next open stream in the rnode's list if an open stream
 * is provided; otherwise gets the first open stream in the list.
 * The open owner for that open stream is then retrieved, and if its
 * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
 * is returned.  *osp is set to the 'found' open stream.
 *
 * Note: we don't set *osp to the open stream retrieved via the
 * optimized check since that won't necessarily be at the beginning
 * of the rnode list, and if that osp doesn't work we'd like to
 * check _all_ open streams (starting from the beginning of the
 * rnode list).
 *
 * This is written to be called repeatedly:
 *  - *first_time is read on entry to decide whether to try the
 *    optimized lookup, and is always FALSE on return.
 *  - *osp carries the iteration position between calls.  The open stream
 *    passed in is released here; the one passed back (if any) has had
 *    its os_ref_count bumped.
 *  - *last_time is set to TRUE (and *osp to NULL) once there is no open
 *    stream left to try; it is not written otherwise.
 *
 * A hold is always placed on the returned cred.
 */
cred_t *
nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
    nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
{
    nfs4_open_stream_t *next_osp = NULL;
    cred_t *ret_cr;

    ASSERT(cr != NULL);
    /*
     * As an optimization, try to find the open owner
     * for the cred provided since that's most likely
     * to work.
     */
    if (*first_time) {
        nfs4_open_owner_t *oop;

        oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
        if (oop) {
            next_osp = find_open_stream(oop, rp);
            /* a found open stream comes back with os_sync_lock held */
            if (next_osp)
                mutex_exit(&next_osp->os_sync_lock);
            open_owner_rele(oop);
        }
    }
    if (next_osp == NULL) {
        int delay_rele = 0;
        /*
         * From here on we are walking the list; clearing *first_time
         * now also makes the code below hand the result back via *osp.
         */
        *first_time = FALSE;

        /* return the next open stream for this rnode */
        mutex_enter(&rp->r_os_lock);
        /* Now, no one can add or delete to rp's open streams list */

        if (*osp) {
            next_osp = list_next(&rp->r_open_streams, *osp);
            /*
             * Delay the rele of *osp until after we drop
             * r_os_lock to not deadlock with oo_lock
             * via an open_stream_rele()->open_owner_rele().
             */
            delay_rele = 1;
        } else {
            next_osp = list_head(&rp->r_open_streams);
        }
        if (next_osp) {
            nfs4_open_stream_t *tmp_osp;

            /*
             * find the next valid open stream; os_valid is examined
             * with that stream's os_sync_lock held, and the lock is
             * dropped before moving on to the next entry
             */
            mutex_enter(&next_osp->os_sync_lock);
            while (next_osp && !next_osp->os_valid) {
                tmp_osp =
                    list_next(&rp->r_open_streams, next_osp);
                mutex_exit(&next_osp->os_sync_lock);
                next_osp = tmp_osp;
                if (next_osp)
                    mutex_enter(&next_osp->os_sync_lock);
            }
            if (next_osp) {
                /* take our own reference on the stream we return */
                next_osp->os_ref_count++;
                mutex_exit(&next_osp->os_sync_lock);
            }
        }
        mutex_exit(&rp->r_os_lock);
        if (delay_rele)
            open_stream_rele(*osp, rp);
    }

    if (next_osp) {
        nfs4_open_owner_t *oop;

        /* prefer the open owner's OTW cred; hold it under oo_lock */
        oop = next_osp->os_open_owner;
        mutex_enter(&oop->oo_lock);
        if (oop->oo_cred_otw)
            ret_cr = oop->oo_cred_otw;
        else
            ret_cr = cr;
        crhold(ret_cr);
        mutex_exit(&oop->oo_lock);
        if (*first_time) {
            /*
             * Still TRUE only when the optimized lookup supplied
             * next_osp: drop that stream and leave *osp NULL so the
             * next call starts from the head of the list.
             */
            open_stream_rele(next_osp, rp);
            *osp = NULL;
        } else
            *osp = next_osp;
    } else {
        /* just return the cred provided to us */
        *last_time = TRUE;
        *osp = NULL;
        ret_cr = cr;
        crhold(ret_cr);
    }

    *first_time = FALSE;
    return (ret_cr);
}

/*
 * Reset a stateid-types record: no current stateid type (NO_SID) and
 * all-zero delegation, lock and open stateids.
 */
void
nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
{
    sid_tp->cur_sid_type = NO_SID;

    bzero(&sid_tp->o_sid, sizeof (sid_tp->o_sid));
    bzero(&sid_tp->l_sid, sizeof (sid_tp->l_sid));
    bzero(&sid_tp->d_sid, sizeof (sid_tp->d_sid));
}

/*
 * Save the stateid '*s1' into the slot of 'sid_tp' that corresponds to
 * its current stateid type: d_sid for DEL_SID, l_sid for LOCK_SID and
 * o_sid for OPEN_SID.  Any other type (including SPEC_SID) is treated as
 * illegal and reported with cmn_err(CE_PANIC).
 */
void
nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
{
    stateid4 *slot = NULL;

    NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
        "nfs4_save_stateid: saved %s stateid",
        sid_tp->cur_sid_type == DEL_SID ? "delegation" :
        sid_tp->cur_sid_type == LOCK_SID ? "lock" :
        sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));

    /* choose the destination for this kind of stateid */
    if (sid_tp->cur_sid_type == DEL_SID)
        slot = &sid_tp->d_sid;
    else if (sid_tp->cur_sid_type == LOCK_SID)
        slot = &sid_tp->l_sid;
    else if (sid_tp->cur_sid_type == OPEN_SID)
        slot = &sid_tp->o_sid;

    if (slot != NULL) {
        *slot = *s1;
    } else {
        /* SPEC_SID or anything unknown: there is nowhere to save it */
        cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
            "stateid type %d", sid_tp->cur_sid_type);
    }
}

/*
 * We got NFS4ERR_BAD_SEQID.  Package up the arguments that get passed to
 * recovery in a newly allocated nfs4_bseqid_entry_t.  The pointers are
 * stored as given; no references are taken on them here.  Only the six
 * bs_* fields set below are initialized (the memory comes from
 * kmem_alloc(), not kmem_zalloc()).
 *
 * Caller is responsible for freeing.
 */
nfs4_bseqid_entry_t *
nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
    vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
{
    nfs4_bseqid_entry_t *entry = kmem_alloc(sizeof (*entry), KM_SLEEP);

    /* what failed ... */
    entry->bs_tag = tag;
    entry->bs_seqid = seqid;

    /* ... and whose state it was */
    entry->bs_pid = pid;
    entry->bs_vp = vp;
    entry->bs_oop = oop;
    entry->bs_lop = lop;

    return (entry);
}

/*
 * Fill in *lost_rqstp after an OPEN_DOWNGRADE attempt.
 *
 * If 'error' is ETIMEDOUT or EINTR, or NFS4_FRC_UNMT_ERR() is true for it
 * on vp's filesystem, the request is recorded as a lost OP_OPEN_DOWNGRADE
 * together with the open owner, open stream, cred and the access/deny
 * bits that were being closed.  For any other error lr_op is set to 0.
 * lr_putfirst is FALSE in both cases.
 */
void
nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
    vnode_t *vp, int access_close, int deny_close)
{
    lost_rqstp->lr_putfirst = FALSE;

    ASSERT(vp != NULL);
    if (error != ETIMEDOUT && error != EINTR &&
        !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
        /* not the kind of error that leaves a request to resend */
        lost_rqstp->lr_op = 0;
        return;
    }

    NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
        "nfs4open_dg_save_lost_rqst: error %d", error));

    lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;

    /*
     * The vp is held and rele'd via the recovery code.
     * See nfs4_save_lost_rqst.
     */
    lost_rqstp->lr_vp = vp;
    lost_rqstp->lr_dvp = NULL;

    /* state the downgrade applies to */
    lost_rqstp->lr_oop = oop;
    lost_rqstp->lr_osp = osp;
    lost_rqstp->lr_lop = NULL;
    lost_rqstp->lr_flk = NULL;
    lost_rqstp->lr_cr = cr;

    /* the share bits that were being given up */
    lost_rqstp->lr_dg_acc = access_close;
    lost_rqstp->lr_dg_deny = deny_close;
}

/*
 * Change the access and deny bits of an OPEN.
 * If recovery is needed, *recov_credpp is set to the cred used OTW,
 * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
 *
 * access_close/deny_close are the bits being given up (access_close is
 * tested against FREAD/FWRITE).  'lrp' is non-NULL when this is the resend
 * of a lost request.  The outcome is reported through 'ep'.
 *
 * The caller must hold osp->os_sync_lock; DEBUG kernels additionally
 * assert that the open owner's seqid is in use.
 *
 * The open stream's os_share_* counts are decremented here either when
 * no OTW call turns out to be necessary, or after the OTW downgrade has
 * succeeded.
 */
void
nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
    nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
{
    mntinfo4_t      *mi;
    int         downgrade_acc, downgrade_deny;
    int         new_acc, new_deny;
    COMPOUND4args_clnt  args;
    COMPOUND4res_clnt   res;
    OPEN_DOWNGRADE4res  *odg_res;
    nfs_argop4      argop[3];
    nfs_resop4      *resop;
    rnode4_t        *rp;
    bool_t          needrecov = FALSE;
    int         doqueue = 1;
    seqid4          seqid = 0;
    cred_t          *cred_otw;
    hrtime_t        t;

    ASSERT(mutex_owned(&osp->os_sync_lock));
#if DEBUG
    mutex_enter(&oop->oo_lock);
    ASSERT(oop->oo_seqid_inuse);
    mutex_exit(&oop->oo_lock);
#endif


    /* Nothing is being closed; cred_otw isn't held yet, so just return. */
    if (access_close == 0 && deny_close == 0) {
        nfs4_error_zinit(ep);
        return;
    }

    /* This places a hold on cred_otw; it is dropped at no_args_out. */
    cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);

    /*
     * We come back here with cred_otw switched to the caller's 'cr' if
     * the first attempt was refused with EACCES/NFS4ERR_ACCESS.
     */
cred_retry:
    nfs4_error_zinit(ep);
    downgrade_acc = 0;
    downgrade_deny = 0;
    mi = VTOMI4(vp);
    rp = VTOR4(vp);

    /*
     * Check to see if the open stream got closed before we go OTW,
     * now that we have acquired the 'os_sync_lock'.
     */
    if (!osp->os_valid) {
        NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
            " open stream has already been closed, return success"));
        /* error has already been set */
        goto no_args_out;
    }

    /* If the file failed recovery, just quit. */
    mutex_enter(&rp->r_statelock);
    if (rp->r_flags & R4RECOVERR) {
        mutex_exit(&rp->r_statelock);
        ep->error = EIO;
        goto no_args_out;
    }
    mutex_exit(&rp->r_statelock);

    /* The OTW request carries the open owner's next seqid. */
    seqid = nfs4_get_open_seqid(oop) + 1;

    NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
        "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
        access_close, osp->os_share_acc_read, osp->os_share_acc_write));

    /* If we're closing the last READ, need to downgrade */
    if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
        downgrade_acc |= OPEN4_SHARE_ACCESS_READ;

    /* if we're closing the last WRITE, need to downgrade */
    if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
        downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;

    /* deny_close is not consulted; the deny bits are always DENY_NONE */
    downgrade_deny = OPEN4_SHARE_DENY_NONE;

    new_acc = 0;
    new_deny = 0;

    /*
     * set our new access and deny share bits: keep each access bit
     * that still has opens and isn't being downgraded away
     */
    if ((osp->os_share_acc_read > 0) &&
        !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
        new_acc |= OPEN4_SHARE_ACCESS_READ;
    if ((osp->os_share_acc_write > 0) &&
        !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
        new_acc |= OPEN4_SHARE_ACCESS_WRITE;

    new_deny = OPEN4_SHARE_DENY_NONE;

    NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
        "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
    NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
        "new acc 0x%x deny 0x%x", new_acc, new_deny));

    /*
     * Check to see if we aren't actually doing any downgrade or
     * if this is the last 'close' but the file is still mmapped.
     * Skip this if this a lost request resend so we don't decrement
     * the osp's share counts more than once.
     */
    if (!lrp &&
        ((downgrade_acc == 0 && downgrade_deny == 0) ||
        (new_acc == 0 && new_deny == 0))) {
        /*
         * No downgrade to do, but still need to
         * update osp's os_share_* counts.
         */
        NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
            "nfs4_open_downgrade: just lower the osp's count by %s",
            (access_close & FREAD) && (access_close & FWRITE) ?
            "read and write" : (access_close & FREAD) ? "read" :
            (access_close & FWRITE) ? "write" : "bogus"));
        if (access_close & FREAD)
            osp->os_share_acc_read--;
        if (access_close & FWRITE)
            osp->os_share_acc_write--;
        osp->os_share_deny_none--;
        nfs4_error_zinit(ep);

        goto no_args_out;
    }

    /*
     * The open stream's recorded open owner name has to match the
     * open owner we were handed; otherwise give up with EIO.
     */
    if (osp->os_orig_oo_name != oop->oo_name) {
        ep->error = EIO;
        goto no_args_out;
    }

    /* setup the COMPOUND args */
    if (lrp)
        args.ctag = TAG_OPEN_DG_LOST;
    else
        args.ctag = TAG_OPEN_DG;

    /* The compound is PUTFH, GETATTR, OPEN_DOWNGRADE. */
    args.array_len = 3;
    args.array = argop;

    /* putfh */
    argop[0].argop = OP_CPUTFH;
    argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

    /* getattr */
    argop[1].argop = OP_GETATTR;
    argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
    argop[1].nfs_argop4_u.opgetattr.mi = mi;

    ASSERT(mutex_owned(&osp->os_sync_lock));
    ASSERT(osp->os_delegation == FALSE);

    /* open downgrade */
    argop[2].argop = OP_OPEN_DOWNGRADE;
    argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
    argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
    argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
    argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;

    /* timestamp taken before the call; passed to nfs4_attr_cache() below */
    t = gethrtime();

    rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

    /* Commit the seqid we sent when the reply says it should be bumped. */
    if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
        nfs4_set_open_seqid(seqid, oop, args.ctag);

    /*
     * Access was refused while using a cred other than the caller's:
     * swap our hold over to 'cr' and try once more.  The results are
     * only freed when the call itself completed (no ep->error).
     */
    if ((ep->error == EACCES ||
        (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
        cred_otw != cr) {
        crfree(cred_otw);
        cred_otw = cr;
        crhold(cred_otw);
        if (!ep->error)
            (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
        goto cred_retry;
    }

    needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);

    /* Hand the caller its own hold on the cred (and the seqid) we used. */
    if (needrecov && recov_credpp) {
        *recov_credpp = cred_otw;
        crhold(*recov_credpp);
        if (recov_seqidp)
            *recov_seqidp = seqid;
    }

    if (!ep->error && !res.status) {
        /* get the open downgrade results */
        resop = &res.array[2];
        odg_res = &resop->nfs_resop4_u.opopen_downgrade;

        osp->open_stateid = odg_res->open_stateid;

        /* set the open streams new access/deny bits */
        if (access_close & FREAD)
            osp->os_share_acc_read--;
        if (access_close & FWRITE)
            osp->os_share_acc_write--;
        osp->os_share_deny_none--;
        osp->os_dc_openacc = new_acc;

        /* cache the attributes returned by the GETATTR (res.array[1]) */
        nfs4_attr_cache(vp,
            &res.array[1].nfs_resop4_u.opgetattr.ga_res,
            t, cred_otw, TRUE, NULL);
    }

    if (!ep->error)
        (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

    /* Every path past nfs4_get_otw_cred() ends up here to drop our hold. */
no_args_out:
    crfree(cred_otw);
}

/*
 * If an OPEN request gets ETIMEDOUT or EINTR (that includes bailing out
 * because the filesystem was forcibly unmounted) then we don't know if we
 * potentially left state dangling on the server, therefore the recovery
 * framework makes this call to resend the OPEN request and then undo it.
 */
void
nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
    nfs4_error_t *ep)
{
    COMPOUND4args_clnt  args;
    COMPOUND4res_clnt   res;
    nfs_argop4      argop[4];
    GETFH4res       *gf_res = NULL;
    OPEN4cargs      *open_args;
    OPEN4res        *op_res;
    char            *destcfp;
    int         destclen;
    nfs4_ga_res_t       *garp;
    vnode_t         *dvp = NULL, *vp = NULL;
    rnode4_t        *rp = NULL, *drp = NULL;
    cred_t          *cr = NULL;
    seqid4          seqid;
    nfs4_open_owner_t   *oop = NULL;
    nfs4_open_stream_t  *osp = NULL;
    component4      *srcfp;
    open_claim_type4    claim;
    mntinfo4_t      *mi;
    int         doqueue = 1;
    bool_t          retry_open = FALSE;
    int         created_osp = 0;
    hrtime_t        t;
    char            *failed_msg = "";
    int         fh_different;
    int         reopen = 0;

    nfs4_error_zinit(ep);

    cr = resend_rqstp->lr_cr;
    dvp = resend_rqstp->lr_dvp;

    vp = *vpp;
    if (vp) {
        ASSERT(nfs4_consistent_type(vp));
        rp = VTOR4(vp);
    }

    if (rp) {
        /* If the file failed recovery, just quit. */
        mutex_enter(&rp->r_statelock);
        if (rp->r_flags & R4RECOVERR) {
            mutex_exit(&rp->r_statelock);
            ep->error = EIO;
            return;
        }
        mutex_exit(&rp->r_statelock);
    }

    if (dvp) {
        drp = VTOR4(dvp);
        /* If the parent directory failed recovery, just quit. */
        mutex_enter(&drp->r_statelock);
        if (drp->r_flags & R4RECOVERR) {
            mutex_exit(&drp->r_statelock);
            ep->error = EIO;
            return;
        }
        mutex_exit(&drp->r_statelock);
    } else
        reopen = 1; /* NULL dvp means this is a reopen */

    claim = resend_rqstp->lr_oclaim;
    ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);

    args.ctag = TAG_OPEN_LOST;
    args.array_len = 4;
    args.array = argop;

    argop[0].argop = OP_CPUTFH;
    if (reopen) {
        ASSERT(vp != NULL);

        mi = VTOMI4(vp);
        /*
         * if this is a file mount then
         * use the mntinfo parentfh
         */
        argop[0].nfs_argop4_u.opcputfh.sfh =
            (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
            VTOSV(vp)->sv_dfh;
        args.ctag = TAG_REOPEN_LOST;
    } else {
        argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
        mi = VTOMI4(dvp);
    }

    argop[1].argop = OP_COPEN;
    open_args = &argop[1].nfs_argop4_u.opcopen;
    open_args->claim = claim;

    /*
     * If we sent over a OPEN with CREATE then the only
     * thing we care about is to not leave dangling state
     * on the server, not whether the file we potentially
     * created remains on the server.  So even though the
     * lost open request specified a CREATE, we only wish
     * to do a non-CREATE OPEN.
     */
    open_args->opentype = OPEN4_NOCREATE;

    srcfp = &resend_rqstp->lr_ofile;
    destclen = srcfp->utf8string_len;
    destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
    bcopy(srcfp->utf8string_val, destcfp, destclen);
    destcfp[destclen] = '\0';
    if (claim == CLAIM_DELEGATE_CUR) {
        open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
            resend_rqstp->lr_ostateid;
        open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
    } else {
        open_args->open_claim4_u.cfile = destcfp;
    }

    open_args->share_access = resend_rqstp->lr_oacc;
    open_args->share_deny = resend_rqstp->lr_odeny;
    oop = resend_rqstp->lr_oop;
    ASSERT(oop != NULL);

    open_args->owner.clientid = mi2clientid(mi);
    /* this length never changes */
    open_args->owner.owner_len = sizeof (oop->oo_name);
    open_args->owner.owner_val =
        kmem_alloc(open_args->owner.owner_len, KM_SLEEP);

    ep->error = nfs4_start_open_seqid_sync(oop, mi);
    ASSERT(ep->error == 0);     /* recov thread always succeeds */
    /*
     * We can get away with not saving the seqid upon detection
     * of a lost request, and now just use the open owner's current
     * seqid since we only allow one op OTW per seqid and lost
     * requests are saved FIFO.
     */
    seqid = nfs4_get_open_seqid(oop) + 1;
    open_args->seqid = seqid;

    bcopy(&oop->oo_name, open_args->owner.owner_val,
        open_args->owner.owner_len);

    /* getfh */
    argop[2].argop = OP_GETFH;

    /* Construct the getattr part of the compound */
    argop[3].argop = OP_GETATTR;
    argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
    argop[3].nfs_argop4_u.opgetattr.mi = mi;

    res.array = NULL;

    t = gethrtime();

    rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

    if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
        nfs4_set_open_seqid(seqid, oop, args.ctag);

    NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
        "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));

    if (ep->error || res.status)
        goto err_out;

    op_res = &res.array[1].nfs_resop4_u.opopen;
    gf_res = &res.array[2].nfs_resop4_u.opgetfh;
    garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;

    if (!vp) {
        int rnode_err = 0;
        nfs4_sharedfh_t *sfh;

        /*
         * If we can't decode all the attributes they are not usable,
         * just make the vnode.
         */

        sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
        *vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
            fn_get(VTOSV(dvp)->sv_name,
            open_args->open_claim4_u.cfile, sfh));
        sfh4_rele(&sfh);
        NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
            "nfs4_resend_open_otw: made vp %p for file %s",
            (void *)(*vpp), open_args->open_claim4_u.cfile));

        if (ep->error)
            PURGE_ATTRCACHE4(*vpp);

        /*
         * For the newly created *vpp case, make sure the rnode
         * isn't bad before using it.
         */
        mutex_enter(&(VTOR4(*vpp))->r_statelock);
        if (VTOR4(*vpp)->r_flags & R4RECOVERR)
            rnode_err = EIO;
        mutex_exit(&(VTOR4(*vpp))->r_statelock);

        if (rnode_err) {
            NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                "nfs4_resend_open_otw: rp %p is bad",
                (void *)VTOR4(*vpp)));
            ep->error = rnode_err;
            goto err_out;
        }

        vp = *vpp;
        rp = VTOR4(vp);
    }

    if (reopen) {
        /*
         * Check if the path we reopened really is the same
         * file. We could end up in a situation where the file
         * was removed and a new file created with the same name.
         */
        (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
        fh_different =
            (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
        if (fh_different) {
            if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
                mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
                /* Oops, we don't have the same file */
                if (mi->mi_fh_expire_type == FH4_PERSISTENT)
                    failed_msg =
                        "Couldn't reopen: Persistant "
                        "file handle changed";
                else
                    failed_msg =
                        "Couldn't reopen: Volatile "
                        "(no expire on open) file handle "
                        "changed";

                nfs4_end_open_seqid_sync(oop);
                kmem_free(destcfp, destclen + 1);
                nfs4args_copen_free(open_args);
                (void) xdr_free(xdr_COMPOUND4res_clnt,
                    (caddr_t)&res);
                nfs_rw_exit(&mi->mi_fh_lock);
                nfs4_fail_recov(vp, failed_msg, ep->error,
                    ep->stat);
                return;
            } else {
                /*
                 * We have volatile file handles that don't
                 * compare.  If the fids are the same then we
                 * assume that the file handle expired but the
                 * rnode still refers to the same file object.
                 *
                 * First check whether we have fids.
                 * If we don't, we have a dumb server so we will
                 * just assume everything is ok for now.
                 */
                if (!ep->error &&
                    garp->n4g_va.va_mask & AT_NODEID &&
                    rp->r_attr.va_mask & AT_NODEID &&
                    rp->r_attr.va_nodeid !=
                    garp->n4g_va.va_nodeid) {
                    /*
                     * We have fids, but they don't
                     * compare. So kill the file.
                     */
                    failed_msg =
                        "Couldn't reopen: file handle "
                        "changed due to mismatched fids";
                    nfs4_end_open_seqid_sync(oop);
                    kmem_free(destcfp, destclen + 1);
                    nfs4args_copen_free(open_args);
                    (void) xdr_free(xdr_COMPOUND4res_clnt,
                        (caddr_t)&res);
                    nfs_rw_exit(&mi->mi_fh_lock);
                    nfs4_fail_recov(vp, failed_msg,
                        ep->error, ep->stat);
                    return;
                } else {
                    /*
                     * We have volatile file handles that
                     * refer to the same file (at least
                     * they have the same fid) or we don't
                     * have fids so we can't tell. :(. We'll
                     * be a kind and accepting client so
                     * we'll update the rnode's file
                     * handle with the otw handle.
                     *
                     * We need to drop mi->mi_fh_lock since
                     * sfh4_update acquires it. Since there
                     * is only one recovery thread there is
                     * no race.
                     */
                    nfs_rw_exit(&mi->mi_fh_lock);
                    sfh4_update(rp->r_fh, &gf_res->object);
                }
            }
        } else {
            nfs_rw_exit(&mi->mi_fh_lock);
        }
    }

    ASSERT(nfs4_consistent_type(vp));

    if (op_res->rflags & OPEN4_RESULT_CONFIRM)
        nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
            &retry_open, oop, TRUE, ep, NULL);
    if (ep->error || ep->stat) {
        nfs4_end_open_seqid_sync(oop);
        kmem_free(destcfp, destclen + 1);
        nfs4args_copen_free(open_args);
        if (!ep->error)
            (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
        return;
    }

    if (reopen) {
        /*
         * Doing a reopen here so the osp should already exist.
         * If not, something changed or went very wrong.
         *
         * returns with 'os_sync_lock' held
         */
        osp = find_open_stream(oop, rp);
        if (!osp) {
            NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                "nfs4_resend_open_otw: couldn't find osp"));
            ep->error = EINVAL;
            goto err_out;
        }
        osp->os_open_ref_count++;
    } else {
        mutex_enter(&oop->oo_lock);
        oop->oo_just_created = NFS4_PERM_CREATED;
        mutex_exit(&oop->oo_lock);

        /* returns with 'os_sync_lock' held */
        osp = find_or_create_open_stream(oop, rp, &created_osp);
        if (!osp) {
            NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                "nfs4_resend_open_otw: couldn't create osp"));
            ep->error = EINVAL;
            goto err_out;
        }
    }

    osp->open_stateid = op_res->stateid;
    osp->os_delegation = FALSE;
    /*
     * Need to reset this bitfield for the possible case where we were
     * going to OTW CLOSE the file, got a non-recoverable error, and before
     * we could retry the CLOSE, OPENed the file again.
     */
    ASSERT(osp->os_open_owner->oo_seqid_inuse);
    osp->os_final_close = 0;
    osp->os_force_close = 0;

    if (!reopen) {
        if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
            osp->os_share_acc_read++;
        if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
            osp->os_share_acc_write++;
        osp->os_share_deny_none++;
    }

    mutex_exit(&osp->os_sync_lock);
    if (created_osp)
        nfs4_inc_state_ref_count(mi);
    open_stream_rele(osp, rp);

    nfs4_end_open_seqid_sync(oop);

    /* accept delegation, if any */
    nfs4_delegation_accept(rp, claim, op_res, garp, cr);

    kmem_free(destcfp, destclen + 1);
    nfs4args_copen_free(open_args);

    if (claim == CLAIM_DELEGATE_CUR)
        nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
    else
        PURGE_ATTRCACHE4(vp);

    (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

    ASSERT(nfs4_consistent_type(vp));

    return;

err_out:
    nfs4_end_open_seqid_sync(oop);
    kmem_free(destcfp, destclen + 1);
    nfs4args_copen_free(open_args);
    if (!ep->error)
        (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}