rdc_dev.c revision fcf3ce441efd61da9bb2884968af01cb7c1452cc
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/file.h>
#include <sys/ddi.h>
#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_errors.h>
#include <sys/unistat/spcs_s_k.h>
#ifdef DS_DDICT
#include "../contract.h"
#endif
#include <sys/nsctl/nsctl.h>
#include <sys/sdt.h> /* dtrace is S10 or later */
#include "rdc.h"
#include "rdc_io.h"
#include "rdc_bitmap.h"
/*
* Remote Dual Copy
*
* This file contains the nsctl io provider functionality for RDC.
*
* RDC is implemented as a simple filter module that pushes itself between
* user (SIMCKD, STE, etc.) and SDBC.
*/
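/*
* Illustrative sketch (not part of the driver source): how the filter is
* pushed into the nsctl stack. The io providers below are registered at
* module initialisation via _rdc_init_dev()/nsc_register_io(), and a
* configured set is interposed upon by rdc_intercept()/nsc_register_path():
*
*	(void) _rdc_init_dev();		registers "rdc-high-cache" and
*					"rdc-high-raw" io providers
*	(void) rdc_intercept(krdc);	registers the data and bitmap paths
*					of one configured set
*
* After this, nsctl routes client io for those paths through the
* _rdc_io_def and _rdc_ior_def entry point tables at the end of this file.
*/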
static int _rdc_open_count;
int rdc_eio_nobmp = 0;
nsc_io_t *_rdc_io_hc;
static nsc_io_t *_rdc_io_hr;
static nsc_def_t _rdc_fd_def[], _rdc_io_def[], _rdc_ior_def[];
void _rdc_deinit_dev();
int rdc_diskq_enqueue(rdc_k_info_t *, rdc_aio_t *);
extern void rdc_unintercept_diskq(rdc_group_t *);
rdc_aio_t *rdc_aio_tbuf_get(void *, void *, int, int, int, int, int);
static nsc_buf_t *_rdc_alloc_handle(void (*)(), void (*)(),
void (*)(), rdc_fd_t *);
static int _rdc_free_handle(rdc_buf_t *, rdc_fd_t *);
#ifdef DEBUG
int rdc_overlap_cnt;
int rdc_overlap_hnd_cnt;
#endif
static rdc_info_dev_t *rdc_devices;
extern int _rdc_rsrv_diskq(rdc_group_t *group);
extern void _rdc_rlse_diskq(rdc_group_t *group);
/*
* _rdc_init_dev
* Initialise the io provider.
*/
int
_rdc_init_dev()
{
_rdc_io_hc = nsc_register_io("rdc-high-cache",
NSC_RDCH_ID|NSC_REFCNT|NSC_FILTER, _rdc_io_def);
if (_rdc_io_hc == NULL)
cmn_err(CE_WARN, "rdc: nsc_register_io (high, cache) failed.");
_rdc_io_hr = nsc_register_io("rdc-high-raw",
NSC_RDCHR_ID|NSC_REFCNT|NSC_FILTER, _rdc_ior_def);
if (_rdc_io_hr == NULL)
cmn_err(CE_WARN, "rdc: nsc_register_io (high, raw) failed.");
if (!_rdc_io_hc || !_rdc_io_hr) {
_rdc_deinit_dev();
return (ENOMEM);
}
return (0);
}
/*
* _rdc_deinit_dev
* De-initialise the io provider.
*
*/
void
_rdc_deinit_dev()
{
int rc;
if (_rdc_io_hc) {
if ((rc = nsc_unregister_io(_rdc_io_hc, 0)) != 0)
cmn_err(CE_WARN,
"rdc: nsc_unregister_io (high, cache) failed: %d",
rc);
}
if (_rdc_io_hr) {
if ((rc = nsc_unregister_io(_rdc_io_hr, 0)) != 0)
cmn_err(CE_WARN,
"rdc: nsc_unregister_io (high, raw) failed: %d",
rc);
}
}
/*
* rdc_idev_open
* - Open the nsctl file descriptors for the data devices.
*
* Must be called with rdc_conf_lock held.
* id_sets is protected by rdc_conf_lock.
*/
static rdc_info_dev_t *
rdc_idev_open(rdc_k_info_t *krdc, char *pathname, int *rc)
{
rdc_info_dev_t *dp;
ASSERT(MUTEX_HELD(&rdc_conf_lock));
for (dp = rdc_devices; dp; dp = dp->id_next) {
if (dp->id_cache_dev.bi_fd &&
strcmp(pathname, nsc_pathname(dp->id_cache_dev.bi_fd)) == 0)
break;
}
if (!dp) {
dp = kmem_zalloc(sizeof (*dp), KM_SLEEP);
if (!dp)
return (NULL);
dp->id_cache_dev.bi_krdc = krdc;
dp->id_cache_dev.bi_fd = nsc_open(pathname,
NSC_RDCHR_ID|NSC_RDWR|NSC_DEVICE,
_rdc_fd_def, (blind_t)&dp->id_cache_dev, rc);
if (!dp->id_cache_dev.bi_fd) {
kmem_free(dp, sizeof (*dp));
return (NULL);
}
dp->id_raw_dev.bi_krdc = krdc;
dp->id_raw_dev.bi_fd = nsc_open(pathname,
NSC_RDCHR_ID|NSC_RDWR|NSC_DEVICE,
_rdc_fd_def, (blind_t)&dp->id_raw_dev, rc);
if (!dp->id_raw_dev.bi_fd) {
(void) nsc_close(dp->id_cache_dev.bi_fd);
kmem_free(dp, sizeof (*dp));
return (NULL);
}
mutex_init(&dp->id_rlock, NULL, MUTEX_DRIVER, NULL);
cv_init(&dp->id_rcv, NULL, CV_DRIVER, NULL);
dp->id_next = rdc_devices;
rdc_devices = dp;
}
dp->id_sets++;
return (dp);
}
/*
* rdc_idev_close
* - Close the nsctl file descriptors for the data devices.
*
* Must be called with rdc_conf_lock and dp->id_rlock held.
* Will release dp->id_rlock before returning.
*
* id_sets is protected by rdc_conf_lock.
*/
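/*
* Caller sketch (mirrors rdc_dev_close() below): both locks are taken by
* the caller, and rdc_idev_close() drops id_rlock itself before returning.
*
*	mutex_enter(&rdc_conf_lock);
*	mutex_enter(&krdc->devices->id_rlock);
*	rdc_idev_close(krdc, krdc->devices);	releases id_rlock
*	krdc->devices = NULL;
*	mutex_exit(&rdc_conf_lock);
*/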
static void
rdc_idev_close(rdc_k_info_t *krdc, rdc_info_dev_t *dp)
{
rdc_info_dev_t **dpp;
#ifdef DEBUG
int count = 0;
#endif
ASSERT(MUTEX_HELD(&rdc_conf_lock));
ASSERT(MUTEX_HELD(&dp->id_rlock));
dp->id_sets--;
if (dp->id_sets > 0) {
mutex_exit(&dp->id_rlock);
return;
}
/* external references must have gone */
ASSERT((krdc->c_ref + krdc->r_ref + krdc->b_ref) == 0);
/* unlink from chain */
for (dpp = &rdc_devices; *dpp; dpp = &((*dpp)->id_next)) {
if (*dpp == dp) {
/* unlink */
*dpp = dp->id_next;
break;
}
}
/*
* Wait for all reserves to go away - the rpc server is
* running asynchronously with this close, and so we
* have to wait for it to spot that the krdc is !IS_ENABLED()
* and throw away the nsc_buf_t's that it has allocated
* and release the device.
*/
while (IS_CRSRV(krdc) || IS_RRSRV(krdc)) {
#ifdef DEBUG
if (!(++count % 16)) {
cmn_err(CE_NOTE,
"_rdc_idev_close(%s): waiting for nsc_release",
rdc_u_info[krdc->index].primary.file);
}
if (count > (16*20)) {
/* waited for 20 seconds - too long - panic */
cmn_err(CE_PANIC,
"_rdc_idev_close(%s, %p): lost nsc_release",
rdc_u_info[krdc->index].primary.file,
(void *)krdc);
}
#endif
mutex_exit(&dp->id_rlock);
delay(HZ>>4);
mutex_enter(&dp->id_rlock);
}
if (dp->id_cache_dev.bi_fd) {
(void) nsc_close(dp->id_cache_dev.bi_fd);
dp->id_cache_dev.bi_fd = NULL;
}
if (dp->id_raw_dev.bi_fd) {
(void) nsc_close(dp->id_raw_dev.bi_fd);
dp->id_raw_dev.bi_fd = NULL;
}
mutex_exit(&dp->id_rlock);
mutex_destroy(&dp->id_rlock);
cv_destroy(&dp->id_rcv);
kmem_free(dp, sizeof (*dp));
}
/*
* This function provokes an nsc_reserve() for the device which
* if successful will populate krdc->maxfbas and urdc->volume_size
* via the _rdc_attach_fd() callback.
*/
void
rdc_get_details(rdc_k_info_t *krdc)
{
int rc;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
nsc_size_t vol_size, maxfbas;
if (_rdc_rsrv_devs(krdc, RDC_RAW, RDC_INTERNAL) == 0) {
/*
* if the vol is already reserved,
* volume_size won't be populated on enable because
* it is a *fake* reserve and does not make it to
* _rdc_attach_fd(). So do it here.
*/
rc = nsc_partsize(RDC_U_FD(krdc), &vol_size);
if (rc != 0) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_get_details: partsize failed (%d)", rc);
#endif /* DEBUG */
urdc->volume_size = vol_size = 0;
}
urdc->volume_size = vol_size;
rc = nsc_maxfbas(RDC_U_FD(krdc), 0, &maxfbas);
if (rc != 0) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_get_details: maxfbas failed (%d)", rc);
#endif /* DEBUG */
maxfbas = 0;
}
krdc->maxfbas = min(RDC_MAX_MAXFBAS, maxfbas);
_rdc_rlse_devs(krdc, RDC_RAW);
}
}
/*
* Should only be used by the config code.
*/
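/*
* Usage sketch (config code only, not a new code path): rdc_conf_lock must
* be held across the call, and the return value is the new set index on
* success or a negative errno on failure.
*
*	mutex_enter(&rdc_conf_lock);
*	index = rdc_dev_open(rdc_set, options);
*	mutex_exit(&rdc_conf_lock);
*	if (index < 0)
*		rc = -index;		recover the positive errno
*/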
int
rdc_dev_open(rdc_set_t *rdc_set, int options)
{
rdc_k_info_t *krdc;
int index;
int rc;
char *pathname;
ASSERT(MUTEX_HELD(&rdc_conf_lock));
if (options & RDC_OPT_PRIMARY)
pathname = rdc_set->primary.file;
else
pathname = rdc_set->secondary.file;
for (index = 0; index < rdc_max_sets; index++) {
krdc = &rdc_k_info[index];
if (!IS_CONFIGURED(krdc))
break;
}
if (index == rdc_max_sets) {
#ifdef DEBUG
cmn_err(CE_WARN, "rdc_dev_open: out of cd\'s");
#endif
index = -EINVAL;
goto out;
}
if (krdc->devices && (krdc->c_fd || krdc->r_fd)) {
#ifdef DEBUG
cmn_err(CE_WARN, "rdc_dev_open: %s already open", pathname);
#endif
index = -EINVAL;
goto out;
}
_rdc_open_count++;
krdc->devices = rdc_idev_open(krdc, pathname, &rc);
if (!krdc->devices) {
index = -rc;
goto open_fail;
}
/*
* Grab the device size and maxfbas now.
*/
rdc_get_details(krdc);
out:
return (index);
open_fail:
_rdc_open_count--;
return (index);
}
void
rdc_dev_close(rdc_k_info_t *krdc)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
mutex_enter(&rdc_conf_lock);
if (krdc->devices)
mutex_enter(&krdc->devices->id_rlock);
#ifdef DEBUG
if (!krdc->devices || !krdc->c_fd || !krdc->r_fd) {
cmn_err(CE_WARN,
"rdc_dev_close(%p): c_fd %p r_fd %p", (void *)krdc,
(void *) (krdc->devices ? krdc->c_fd : 0),
(void *) (krdc->devices ? krdc->r_fd : 0));
}
#endif
if (krdc->devices) {
/* rdc_idev_close will release id_rlock */
rdc_idev_close(krdc, krdc->devices);
krdc->devices = NULL;
}
urdc->primary.file[0] = '\0';
if (_rdc_open_count <= 0) {
cmn_err(CE_WARN,
"rdc: _rdc_open_count corrupt: %d",
_rdc_open_count);
}
_rdc_open_count--;
mutex_exit(&rdc_conf_lock);
}
/*
* rdc_intercept
*
* Register for IO on this device with nsctl.
*
* For a 1-to-many primary we register for each krdc and let nsctl sort
* out which it wants to be using. This means that we cannot tell which
* krdc will receive the incoming io from nsctl, though we do know that
* at any one time only one krdc will be 'attached' and so get io from
* nsctl.
*
* So the krdc->many_next pointer is maintained as a circular list. The
* result of these multiple nsc_register_paths is that we will see a
* few more attach and detach io provider calls during enable/resume
* and disable/suspend of the 1-to-many whilst nsctl settles down to
* using a single krdc.
*
* The major advantage of this scheme is that nsctl sorts out all the
* rdc_fd_t's so that they can only point to krdc's that are currently
* active.
*/
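/*
* Traversal sketch for the 1-to-many list described above: krdc->many_next
* is circular, so a walk always terminates back at the starting krdc. This
* is the pattern used by _rdc_remote_read() and _rdc_write() below.
*
*	rdc_k_info_t *this = krdc;
*
*	rdc_many_enter(krdc);
*	for (krdc = krdc->many_next; krdc != this;
*	    krdc = krdc->many_next) {
*		urdc = &rdc_u_info[krdc->index];
*		if (!IS_ENABLED(urdc))
*			continue;
*		(act on this member of the 1-to-many configuration)
*	}
*	rdc_many_exit(krdc);
*/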
int
rdc_intercept(rdc_k_info_t *krdc)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
char *pathname;
char *bitmap;
if (rdc_get_vflags(urdc) & RDC_PRIMARY) {
pathname = urdc->primary.file;
bitmap = urdc->primary.bitmap;
} else {
pathname = urdc->secondary.file;
bitmap = urdc->secondary.bitmap;
}
if (!krdc->b_tok)
krdc->b_tok = nsc_register_path(bitmap, NSC_CACHE | NSC_DEVICE,
_rdc_io_hc);
if (!krdc->c_tok)
krdc->c_tok = nsc_register_path(pathname, NSC_CACHE,
_rdc_io_hc);
if (!krdc->r_tok)
krdc->r_tok = nsc_register_path(pathname, NSC_DEVICE,
_rdc_io_hr);
if (!krdc->c_tok || !krdc->r_tok) {
(void) rdc_unintercept(krdc);
return (ENXIO);
}
return (0);
}
static void
wait_unregistering(rdc_k_info_t *krdc)
{
while (krdc->group->unregistering > 0)
(void) cv_wait_sig(&krdc->group->unregistercv, &rdc_conf_lock);
}
static void
set_unregistering(rdc_k_info_t *krdc)
{
wait_unregistering(krdc);
krdc->group->unregistering++;
}
static void
wakeup_unregistering(rdc_k_info_t *krdc)
{
if (krdc->group->unregistering <= 0)
return;
krdc->group->unregistering--;
cv_broadcast(&krdc->group->unregistercv);
}
/*
* rdc_unintercept
*
* Unregister for IO on this device.
*
* See comments above rdc_intercept.
*/
int
rdc_unintercept(rdc_k_info_t *krdc)
{
int err = 0;
int rc;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
mutex_enter(&rdc_conf_lock);
set_unregistering(krdc);
krdc->type_flag |= RDC_UNREGISTER;
mutex_exit(&rdc_conf_lock);
if (krdc->r_tok) {
rc = nsc_unregister_path(krdc->r_tok, 0);
if (rc) {
cmn_err(CE_WARN, "rdc: unregister rawfd %d", rc);
err = rc;
}
krdc->r_tok = NULL;
}
if (krdc->c_tok) {
rc = nsc_unregister_path(krdc->c_tok, 0);
if (rc) {
cmn_err(CE_WARN, "rdc: unregister cachefd %d", rc);
if (!err)
err = rc;
}
krdc->c_tok = NULL;
}
if (krdc->b_tok) {
rc = nsc_unregister_path(krdc->b_tok, 0);
if (rc) {
cmn_err(CE_WARN, "rdc: unregister bitmap %d", rc);
err = rc;
}
krdc->b_tok = NULL;
}
rdc_group_enter(krdc);
/* Wait for all necessary _rdc_close() calls to complete */
while ((krdc->c_ref + krdc->r_ref + krdc->b_ref) != 0) {
krdc->closing++;
cv_wait(&krdc->closingcv, &krdc->group->lock);
krdc->closing--;
}
rdc_clr_flags(urdc, RDC_ENABLED);
rdc_group_exit(krdc);
/*
* Check that there are no outstanding writes in progress.
* This can happen when a set that is part of a 'one_to_many'
* chain, and that did not intercept the original write call,
* is being disabled.
*/
for (;;) {
rdc_group_enter(krdc);
if (krdc->aux_state & RDC_AUXWRITE) {
rdc_group_exit(krdc);
/*
* This doesn't happen very often,
* just delay a bit and re-look.
*/
delay(50);
} else {
rdc_group_exit(krdc);
break;
}
}
mutex_enter(&rdc_conf_lock);
krdc->type_flag &= ~RDC_UNREGISTER;
wakeup_unregistering(krdc);
mutex_exit(&rdc_conf_lock);
return (err);
}
/*
* _rdc_rlse_d
* Internal version of _rdc_rlse_devs(), only concerned with the
* data device, not the bitmap.
*/
static void
_rdc_rlse_d(rdc_k_info_t *krdc, int devs)
{
_rdc_info_dev_t *cip;
_rdc_info_dev_t *rip;
int raw = (devs & RDC_RAW);
if (!krdc) {
cmn_err(CE_WARN, "rdc: _rdc_rlse_devs null krdc");
return;
}
ASSERT((devs & (~RDC_BMP)) != 0);
cip = &krdc->devices->id_cache_dev;
rip = &krdc->devices->id_raw_dev;
if (IS_RSRV(cip)) {
/* decrement count */
if (raw) {
if (cip->bi_ofailed > 0) {
cip->bi_ofailed--;
} else if (cip->bi_orsrv > 0) {
cip->bi_orsrv--;
}
} else {
if (cip->bi_failed > 0) {
cip->bi_failed--;
} else if (cip->bi_rsrv > 0) {
cip->bi_rsrv--;
}
}
/*
* reset nsc_fd ownership back link, it is only set if
* we have really done an underlying reserve, not for
* failed (faked) reserves.
*/
if (cip->bi_rsrv > 0 || cip->bi_orsrv > 0) {
nsc_set_owner(cip->bi_fd, krdc->iodev);
} else {
nsc_set_owner(cip->bi_fd, NULL);
}
/* release nsc_fd */
if (!IS_RSRV(cip)) {
nsc_release(cip->bi_fd);
}
} else if (IS_RSRV(rip)) {
/* decrement count */
if (raw) {
if (rip->bi_failed > 0) {
rip->bi_failed--;
} else if (rip->bi_rsrv > 0) {
rip->bi_rsrv--;
}
} else {
if (rip->bi_ofailed > 0) {
rip->bi_ofailed--;
} else if (rip->bi_orsrv > 0) {
rip->bi_orsrv--;
}
}
/*
* reset nsc_fd ownership back link, it is only set if
* we have really done an underlying reserve, not for
* failed (faked) reserves.
*/
if (rip->bi_rsrv > 0 || rip->bi_orsrv > 0) {
nsc_set_owner(rip->bi_fd, krdc->iodev);
} else {
nsc_set_owner(rip->bi_fd, NULL);
}
/* release nsc_fd and any waiters */
if (!IS_RSRV(rip)) {
rip->bi_flag = 0;
nsc_release(rip->bi_fd);
cv_broadcast(&krdc->devices->id_rcv);
}
} else {
cmn_err(CE_WARN, "rdc: _rdc_rlse_devs no reserve? krdc %p",
(void *) krdc);
}
}
/*
* _rdc_rlse_devs
* Release named underlying devices and take care of setting the
* back link on the nsc_fd to the correct parent iodev.
*
* NOTE: the 'devs' argument must be the same as that passed to
* the preceding _rdc_rsrv_devs call.
*/
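/*
* Pairing sketch: a release must name the same devices as the preceding
* reserve, as rdc_get_details() above does for the raw data device.
*
*	if (_rdc_rsrv_devs(krdc, RDC_RAW, RDC_INTERNAL) == 0) {
*		(io against RDC_U_FD(krdc))
*		_rdc_rlse_devs(krdc, RDC_RAW);
*	}
*/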
void
_rdc_rlse_devs(rdc_k_info_t *krdc, int devs)
{
DTRACE_PROBE(_rdc_rlse_devs_start);
mutex_enter(&krdc->devices->id_rlock);
ASSERT(!(devs & RDC_CACHE));
if ((devs & (~RDC_BMP)) != 0) {
_rdc_rlse_d(krdc, devs);
}
if ((devs & RDC_BMP) != 0) {
if (krdc->bmaprsrv > 0 && --krdc->bmaprsrv == 0) {
nsc_release(krdc->bitmapfd);
}
}
mutex_exit(&krdc->devices->id_rlock);
}
/*
* _rdc_rsrv_d
* Reserve the flagged device, unless its companion is already reserved,
* in which case increase the reserve count on the companion. Take care
* of setting the nsc_fd ownership back link to the correct parent
* iodev pointer.
*/
static int
_rdc_rsrv_d(int raw, _rdc_info_dev_t *rid, _rdc_info_dev_t *cid, int flag,
rdc_k_info_t *krdc)
{
_rdc_info_dev_t *p = NULL;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
int other = 0;
int rc;
#ifdef DEBUG
if ((rid->bi_rsrv < 0) ||
(cid->bi_rsrv < 0) ||
(rid->bi_orsrv < 0) ||
(cid->bi_orsrv < 0) ||
(rid->bi_failed < 0) ||
(cid->bi_failed < 0) ||
(rid->bi_ofailed < 0) ||
(cid->bi_ofailed < 0)) {
cmn_err(CE_WARN,
"_rdc_rsrv_d: negative counts (rsrv %d %d orsrv %d %d)",
rid->bi_rsrv, cid->bi_rsrv,
rid->bi_orsrv, cid->bi_orsrv);
cmn_err(CE_WARN,
"_rdc_rsrv_d: negative counts (fail %d %d ofail %d %d)",
rid->bi_failed, cid->bi_failed,
rid->bi_ofailed, cid->bi_ofailed);
cmn_err(CE_PANIC, "_rdc_rsrv_d: negative counts (krdc %p)",
(void *) krdc);
}
#endif
/*
* If user wants to do a cache reserve and it's already
* raw reserved internally, we need to do a real nsc_reserve, so wait
* until the release has been done.
*/
if (IS_RSRV(rid) && (flag == RDC_EXTERNAL) &&
(raw == 0) && (rid->bi_flag != RDC_EXTERNAL)) {
krdc->devices->id_release++;
while (IS_RSRV(rid))
cv_wait(&krdc->devices->id_rcv,
&krdc->devices->id_rlock);
krdc->devices->id_release--;
}
/* select underlying device to use */
if (IS_RSRV(rid)) {
p = rid;
if (!raw) {
other = 1;
}
} else if (IS_RSRV(cid)) {
p = cid;
if (raw) {
other = 1;
}
}
/* just increment count and return if already reserved */
if (p && !RFAILED(p)) {
if (other) {
p->bi_orsrv++;
} else {
p->bi_rsrv++;
}
/* set nsc_fd ownership back link */
nsc_set_owner(p->bi_fd, krdc->iodev);
return (0);
}
/* attempt reserve */
if (!p) {
p = raw ? rid : cid;
}
if (!p->bi_fd) {
/* rpc server raced with rdc_dev_close() */
return (EIO);
}
if ((rc = nsc_reserve(p->bi_fd, 0)) == 0) {
/*
* convert failed counts into reserved counts, and add
* in this reserve.
*/
p->bi_orsrv = p->bi_ofailed;
p->bi_rsrv = p->bi_failed;
if (other) {
p->bi_orsrv++;
} else {
p->bi_rsrv++;
}
p->bi_ofailed = 0;
p->bi_failed = 0;
/* set nsc_fd ownership back link */
nsc_set_owner(p->bi_fd, krdc->iodev);
} else if (rc != EINTR) {
/*
* If this is the master, and the secondary is not
* failed, then just fake this external reserve so that
* we can do remote io to the secondary and continue to
* provide service to the client.
*
* Subsequent calls to _rdc_rsrv_d() will re-try the
* nsc_reserve() until it succeeds.
*/
if ((rdc_get_vflags(urdc) & RDC_PRIMARY) &&
!(rdc_get_vflags(urdc) & RDC_LOGGING) &&
!((rdc_get_vflags(urdc) & RDC_SLAVE) &&
(rdc_get_vflags(urdc) & RDC_SYNCING))) {
if (!(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) {
rdc_many_enter(krdc);
/* Primary, so reverse sync needed */
rdc_set_mflags(urdc, RDC_RSYNC_NEEDED);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"nsc_reserve failed");
rdc_many_exit(krdc);
rc = -1;
#ifdef DEBUG
cmn_err(CE_NOTE, "nsc_reserve failed "
"with rc == %d\n", rc);
#endif
} else {
rc = 0;
}
if (other) {
p->bi_ofailed++;
} else {
p->bi_failed++;
}
if (krdc->maxfbas == 0) {
/*
* fake a maxfbas value for remote i/o;
* this will get reset when the next
* successful reserve happens as part
* of the _rdc_attach_fd() callback.
*/
krdc->maxfbas = 128;
}
}
}
if (rc == 0 && raw) {
p->bi_flag = flag;
}
return (rc);
}
/*
* _rdc_rsrv_devs
* Reserve named underlying devices.
*
*/
int
_rdc_rsrv_devs(rdc_k_info_t *krdc, int devs, int flag)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
int write = 0;
int rc = 0;
int got = 0;
if (!krdc) {
cmn_err(CE_WARN, "rdc: _rdc_rsrv_devs null krdc");
return (EINVAL);
}
ASSERT(!(devs & RDC_CACHE));
mutex_enter(&krdc->devices->id_rlock);
if ((devs & (~RDC_BMP)) != 0) {
if ((rc = _rdc_rsrv_d((devs & RDC_CACHE) == 0,
&krdc->devices->id_raw_dev, &krdc->devices->id_cache_dev,
flag, krdc)) != 0) {
if (rc == -1) {
/*
* we need to call rdc_write_state()
* after we drop the mutex
*/
write = 1;
rc = 0;
} else {
cmn_err(CE_WARN,
"rdc: nsc_reserve(%s) failed %d\n",
nsc_pathname(krdc->c_fd), rc);
}
} else {
got |= (devs & (~RDC_BMP));
}
}
if (rc == 0 && (devs & RDC_BMP) != 0) {
if (krdc->bitmapfd == NULL)
rc = EIO;
else if ((krdc->bmaprsrv == 0) &&
(rc = nsc_reserve(krdc->bitmapfd, 0)) != 0) {
cmn_err(CE_WARN,
"rdc: nsc_reserve(%s) failed %d\n",
nsc_pathname(krdc->bitmapfd), rc);
} else {
krdc->bmaprsrv++;
got |= RDC_BMP;
}
if (!RDC_SUCCESS(rc)) {
/* Undo any previous reserve */
if (got != 0)
_rdc_rlse_d(krdc, got);
}
}
mutex_exit(&krdc->devices->id_rlock);
if (write) {
rdc_write_state(urdc);
}
return (rc);
}
/*
* Read from the remote end, ensuring that if this is a many group in
* slave mode we only remote read from the secondary that holds the
* valid data.
*/
int
_rdc_remote_read(rdc_k_info_t *krdc, nsc_buf_t *h, nsc_off_t pos,
nsc_size_t len, int flag)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
rdc_k_info_t *this = krdc; /* krdc that was requested */
int rc;
if (flag & NSC_RDAHEAD) {
/*
* no point in doing readahead remotely,
* just say we did it ok - the client is about to
* throw this buffer away as soon as we return.
*/
return (NSC_DONE);
}
/*
* If this is a many group with a reverse sync in progress and
* this is not the slave krdc/urdc, then search for the slave
* so that we can do the remote io from the correct secondary.
*/
if ((rdc_get_mflags(urdc) & RDC_SLAVE) &&
!(rdc_get_vflags(urdc) & RDC_SLAVE)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
if (rdc_get_vflags(urdc) & RDC_SLAVE)
break;
}
rdc_many_exit(krdc);
this = krdc;
}
read1:
if (rdc_get_vflags(urdc) & RDC_LOGGING) {
/* cannot do remote io without the remote node! */
rc = ENETDOWN;
goto read2;
}
/* wait for the remote end to have the latest data */
if (IS_ASYNC(urdc)) {
while (krdc->group->ra_queue.blocks != 0) {
if (!krdc->group->rdc_writer)
(void) rdc_writer(krdc->index);
(void) rdc_drain_queue(krdc->index);
}
}
if (krdc->io_kstats) {
mutex_enter(krdc->io_kstats->ks_lock);
kstat_runq_enter(KSTAT_IO_PTR(krdc->io_kstats));
mutex_exit(krdc->io_kstats->ks_lock);
}
rc = rdc_net_read(krdc->index, krdc->remote_index, h, pos, len);
if (krdc->io_kstats) {
mutex_enter(krdc->io_kstats->ks_lock);
kstat_runq_exit(KSTAT_IO_PTR(krdc->io_kstats));
mutex_exit(krdc->io_kstats->ks_lock);
}
/* If read error keep trying every secondary until no more */
read2:
if (!RDC_SUCCESS(rc) && IS_MANY(krdc) &&
!(rdc_get_mflags(urdc) & RDC_SLAVE)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
rdc_many_exit(krdc);
goto read1;
}
rdc_many_exit(krdc);
}
return (rc);
}
/*
* _rdc_alloc_buf
* Allocate a buffer of data
*
* Calling/Exit State:
* Returns NSC_DONE or NSC_HIT for success, NSC_PENDING for async
* I/O, > 0 is an error code.
*
* Description:
*/
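/*
* Caller sketch (hypothetical nsctl client, not part of this driver): this
* entry point is reached through nsc_alloc_buf() on an nsc_fd_t whose path
* is intercepted by RDC. The 'fd' below is such a descriptor obtained from
* nsc_open() and reserved by the client.
*
*	nsc_buf_t *buf = NULL;
*	int rc;
*
*	rc = nsc_alloc_buf(fd, pos, len, NSC_WRBUF, &buf);
*	if (rc > 0) {
*		(rc is an errno, e.g. EIO - handle the failure)
*	} else {
*		(void) nsc_write(buf, pos, len, 0);
*		(void) nsc_free_buf(buf);
*	}
*/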
int rdcbufs = 0;
static int
_rdc_alloc_buf(rdc_fd_t *rfd, nsc_off_t pos, nsc_size_t len, int flag,
rdc_buf_t **ptr)
{
rdc_k_info_t *krdc = rfd->rdc_info;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
nsc_vec_t *vec = NULL;
rdc_buf_t *h;
size_t size;
int ioflag;
int rc = 0;
if (RDC_IS_BMP(rfd) || RDC_IS_QUE(rfd))
return (EIO);
if (len == 0)
return (EINVAL);
if (flag & NSC_WRBUF) {
if (!(rdc_get_vflags(urdc) & RDC_PRIMARY) &&
!(rdc_get_vflags(urdc) & RDC_LOGGING)) {
/*
* Forbid writes to secondary unless logging.
*/
return (EIO);
}
}
if (!(rdc_get_vflags(urdc) & RDC_PRIMARY) &&
(rdc_get_vflags(urdc) & RDC_SYNC_NEEDED)) {
/*
* Forbid any io to secondary if it needs a sync.
*/
return (EIO);
}
if ((rdc_get_vflags(urdc) & RDC_PRIMARY) &&
(rdc_get_vflags(urdc) & RDC_RSYNC_NEEDED) &&
!(rdc_get_vflags(urdc) & RDC_VOL_FAILED) &&
!(rdc_get_vflags(urdc) & RDC_SLAVE)) {
/*
* Forbid any io to primary if it needs a reverse sync
* and is not actively syncing.
*/
return (EIO);
}
/* Bounds checking */
ASSERT(urdc->volume_size != 0);
if (pos + len > urdc->volume_size) {
#ifdef DEBUG
cmn_err(CE_NOTE,
"rdc: Attempt to access beyond end of rdc volume");
#endif
return (EIO);
}
h = *ptr;
if (h == NULL) {
/* should never happen (nsctl does this for us) */
#ifdef DEBUG
cmn_err(CE_WARN, "_rdc_alloc_buf entered without buffer!");
#endif
h = (rdc_buf_t *)_rdc_alloc_handle(NULL, NULL, NULL, rfd);
if (h == NULL)
return (ENOMEM);
h->rdc_bufh.sb_flag &= ~NSC_HALLOCATED;
*ptr = h;
}
if (flag & NSC_NOBLOCK) {
cmn_err(CE_WARN,
"_rdc_alloc_buf: removing unsupported NSC_NOBLOCK flag");
flag &= ~(NSC_NOBLOCK);
}
h->rdc_bufh.sb_error = 0;
h->rdc_bufh.sb_flag |= flag;
h->rdc_bufh.sb_pos = pos;
h->rdc_bufh.sb_len = len;
ioflag = flag;
bzero(&h->rdc_sync, sizeof (h->rdc_sync));
mutex_init(&h->rdc_sync.lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&h->rdc_sync.cv, NULL, CV_DRIVER, NULL);
if (flag & NSC_WRBUF)
_rdc_async_throttle(krdc, len); /* throttle incoming io */
/*
* Use remote io when:
* - local volume is failed
* - reserve status is failed
*/
if ((rdc_get_vflags(urdc) & RDC_VOL_FAILED) || IS_RFAILED(krdc)) {
rc = EIO;
} else {
rc = nsc_alloc_buf(RDC_U_FD(krdc), pos, len,
ioflag, &h->rdc_bufp);
if (!RDC_SUCCESS(rc)) {
rdc_many_enter(krdc);
if (rdc_get_vflags(urdc) & RDC_PRIMARY) {
/* Primary, so reverse sync needed */
rdc_set_mflags(urdc, RDC_RSYNC_NEEDED);
} else {
/* Secondary, so forward sync needed */
rdc_set_flags(urdc, RDC_SYNC_NEEDED);
}
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"nsc_alloc_buf failed");
rdc_many_exit(krdc);
rdc_write_state(urdc);
}
}
if (RDC_SUCCESS(rc)) {
h->rdc_bufh.sb_vec = h->rdc_bufp->sb_vec;
h->rdc_flags |= RDC_ALLOC;
/*
* If in slave and reading data, remote read on top of
* the buffer to ensure that we have the latest data.
*/
if ((flag & NSC_READ) &&
(rdc_get_vflags(urdc) & RDC_PRIMARY) &&
(rdc_get_mflags(urdc) & RDC_SLAVE)) {
rc = _rdc_remote_read(krdc, &h->rdc_bufh,
pos, len, flag);
/*
* Set NSC_MIXED so that the
* cache will throw away this buffer when we free
* it since we have combined data from multiple
* sources into a single buffer.
*/
h->rdc_bufp->sb_flag |= NSC_MIXED;
}
}
/*
* If nsc_alloc_buf above failed, or the local volume, bitmap or
* reserve is failed, then fill the buffer from the remote node.
*/
if ((!RDC_SUCCESS(rc)) && (rdc_get_vflags(urdc) & RDC_PRIMARY) &&
!(rdc_get_vflags(urdc) & RDC_LOGGING)) {
if (flag & NSC_NODATA) {
ASSERT(!(flag & NSC_READ));
h->rdc_flags |= RDC_REMOTE_BUF;
h->rdc_bufh.sb_vec = NULL;
} else {
size = sizeof (nsc_vec_t) * 2;
h->rdc_vsize = size + FBA_SIZE(len);
vec = kmem_zalloc(h->rdc_vsize, KM_SLEEP);
if (!vec) {
rc = ENOMEM;
goto error;
}
/* single flat buffer */
vec[0].sv_addr = (uchar_t *)vec + size;
vec[0].sv_len = FBA_SIZE(len);
vec[0].sv_vme = 0;
/* null terminator */
vec[1].sv_addr = NULL;
vec[1].sv_len = 0;
vec[1].sv_vme = 0;
h->rdc_bufh.sb_vec = vec;
h->rdc_flags |= RDC_REMOTE_BUF;
h->rdc_flags |= RDC_VEC_ALLOC;
}
if (flag & NSC_READ) {
rc = _rdc_remote_read(krdc, &h->rdc_bufh,
pos, len, flag);
} else {
rc = NSC_DONE;
}
}
error:
if (!RDC_SUCCESS(rc)) {
h->rdc_bufh.sb_error = rc;
}
return (rc);
}
/*
* _rdc_free_buf
*/
static int
_rdc_free_buf(rdc_buf_t *h)
{
int rc = 0;
if (h->rdc_flags & RDC_ALLOC) {
if (h->rdc_bufp) {
rc = nsc_free_buf(h->rdc_bufp);
}
h->rdc_flags &= ~(RDC_ALLOC);
if (!RDC_SUCCESS(rc)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"_rdc_free_buf(%p): nsc_free_buf(%p) returned %d",
(void *) h, (void *) h->rdc_bufp, rc);
#endif
return (rc);
}
}
if (h->rdc_flags & (RDC_REMOTE_BUF|RDC_VEC_ALLOC)) {
if (h->rdc_flags & RDC_VEC_ALLOC) {
kmem_free(h->rdc_bufh.sb_vec, h->rdc_vsize);
}
h->rdc_flags &= ~(RDC_REMOTE_BUF|RDC_VEC_ALLOC);
}
if (h->rdc_anon) {
/* anon buffers still pending */
DTRACE_PROBE1(rdc_free_buf_err, aio_buf_t, h->rdc_anon);
}
if ((h->rdc_bufh.sb_flag & NSC_HALLOCATED) == 0) {
rc = _rdc_free_handle(h, h->rdc_fd);
if (!RDC_SUCCESS(rc)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"_rdc_free_buf(%p): _rdc_free_handle returned %d",
(void *) h, rc);
#endif
return (rc);
}
} else {
h->rdc_bufh.sb_flag = NSC_HALLOCATED;
h->rdc_bufh.sb_vec = NULL;
h->rdc_bufh.sb_error = 0;
h->rdc_bufh.sb_pos = 0;
h->rdc_bufh.sb_len = 0;
h->rdc_anon = NULL;
h->rdc_vsize = 0;
cv_destroy(&h->rdc_sync.cv);
mutex_destroy(&h->rdc_sync.lock);
}
return (0);
}
/*
* _rdc_open
* Open a device
*
* Calling/Exit State:
* Returns a token to identify the device.
*
* Description:
* Performs the housekeeping operations associated with an upper layer
* of the nsctl stack opening a device.
*/
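/*
* Token flow sketch: the blind_t stored through *cdp is the rdc_fd_t
* allocated below; nsctl hands it back unchanged as the 'rfd' argument of
* the other entry points in _rdc_io_def/_rdc_ior_def, e.g.
*
*	blind_t cd;
*	(void) _rdc_openc(path, 0, &cd, iodev);	cd is (blind_t)rfd
*	...
*	(void) _rdc_close((rdc_fd_t *)cd);
*/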
/* ARGSUSED */
static int
_rdc_open(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev)
{
rdc_k_info_t *krdc;
#ifdef DEBUG
rdc_u_info_t *urdc;
#endif
rdc_fd_t *rfd;
int raw = ((flag & NSC_CACHE) == 0);
int index;
int bmp = 0;
int queue = 0;
rfd = kmem_zalloc(sizeof (*rfd), KM_SLEEP);
if (!rfd)
return (ENOMEM);
/*
* Take config lock to prevent a race with the
* (de)configuration code.
*/
mutex_enter(&rdc_conf_lock);
index = rdc_lookup_enabled(path, 0);
if (index < 0) {
index = rdc_lookup_bitmap(path);
if (index >= 0)
bmp = 1;
}
if (index < 0) {
index = rdc_lookup_diskq(path);
if (index >= 0)
queue = 1;
}
if (index < 0) {
/* not found in config */
mutex_exit(&rdc_conf_lock);
kmem_free(rfd, sizeof (*rfd));
return (ENXIO);
}
#ifdef DEBUG
urdc = &rdc_u_info[index];
#endif
krdc = &rdc_k_info[index];
mutex_exit(&rdc_conf_lock);
rdc_group_enter(krdc);
ASSERT(IS_ENABLED(urdc));
if (bmp) {
krdc->b_ref++;
} else if (raw) {
krdc->r_ref++;
} else if (!queue) {
krdc->c_ref++;
}
rfd->rdc_info = krdc;
if (bmp)
rfd->rdc_type = RDC_BMP;
else if (queue)
rfd->rdc_type = RDC_QUE;
else
rfd->rdc_oflags = flag;
rdc_group_exit(krdc);
*cdp = (blind_t)rfd;
return (0);
}
static int
_rdc_openc(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev)
{
return (_rdc_open(path, NSC_CACHE|flag, cdp, iodev));
}
static int
_rdc_openr(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev)
{
return (_rdc_open(path, NSC_DEVICE|flag, cdp, iodev));
}
/*
* _rdc_close
* Close a device
*
* Calling/Exit State:
* Always succeeds - returns 0
*
* Description:
* Performs the housekeeping operations associated with an upper layer
* of the nsctl stack closing a device.
*/
static int
_rdc_close(rdc_fd_t *rfd)
{
rdc_k_info_t *krdc = rfd->rdc_info;
int bmp = RDC_IS_BMP(rfd);
int raw = RDC_IS_RAW(rfd);
int queue = RDC_IS_QUE(rfd);
/*
* we don't keep ref counts for the queue, so skip this stuff.
* we may not even have a valid krdc at this point
*/
if (queue)
goto queue;
rdc_group_enter(krdc);
if (bmp) {
krdc->b_ref--;
} else if (raw && !queue) {
krdc->r_ref--;
} else if (!queue) {
krdc->c_ref--;
}
if (krdc->closing) {
cv_broadcast(&krdc->closingcv);
}
rdc_group_exit(krdc);
queue:
kmem_free(rfd, sizeof (*rfd));
return (0);
}
/*
* _rdc_alloc_handle
* Allocate a handle
*
*/
static nsc_buf_t *
_rdc_alloc_handle(void (*d_cb)(), void (*r_cb)(), void (*w_cb)(), rdc_fd_t *rfd)
{
rdc_buf_t *h;
h = kmem_zalloc(sizeof (*h), KM_SLEEP);
if (!h)
return (NULL);
h->rdc_bufp = nsc_alloc_handle(RDC_FD(rfd), d_cb, r_cb, w_cb);
if (!h->rdc_bufp) {
if (!IS_RFAILED(rfd->rdc_info)) {
/*
* This is a real failure from the io provider below.
*/
kmem_free(h, sizeof (*h));
return (NULL);
} else {
/* EMPTY */
/*
* This is just a failed primary device where
* we can do remote io to the secondary.
*/
}
}
h->rdc_bufh.sb_flag = NSC_HALLOCATED;
h->rdc_fd = rfd;
mutex_init(&h->aio_lock, NULL, MUTEX_DRIVER, NULL);
return (&h->rdc_bufh);
}
/*
* _rdc_free_handle
* Free a handle
*
*/
/* ARGSUSED */
static int
_rdc_free_handle(rdc_buf_t *h, rdc_fd_t *rfd)
{
int rc;
mutex_destroy(&h->aio_lock);
if (h->rdc_bufp) {
rc = nsc_free_handle(h->rdc_bufp);
if (!RDC_SUCCESS(rc))
return (rc);
}
kmem_free(h, sizeof (rdc_buf_t));
return (0);
}
/*
* _rdc_attach
* Attach
*
* Calling/Exit State:
* Returns 0 for success, errno on failure.
*
* Description:
*/
static int
_rdc_attach(rdc_fd_t *rfd, nsc_iodev_t *iodev)
{
rdc_k_info_t *krdc;
int raw = RDC_IS_RAW(rfd);
int rc;
if ((RDC_IS_BMP(rfd)) || RDC_IS_QUE(rfd))
return (EINVAL);
krdc = rfd->rdc_info;
if (krdc == NULL)
return (EINVAL);
mutex_enter(&krdc->devices->id_rlock);
krdc->iodev = iodev;
mutex_exit(&krdc->devices->id_rlock);
rc = _rdc_rsrv_devs(krdc, (raw ? RDC_RAW : RDC_CACHE), RDC_EXTERNAL);
return (rc);
}
/*
* _rdc_detach
* Detach
*
* Calling/Exit State:
* Returns 0 for success, always succeeds
*
* Description:
*/
static int
_rdc_detach(rdc_fd_t *rfd, nsc_iodev_t *iodev)
{
rdc_k_info_t *krdc = rfd->rdc_info;
int raw = RDC_IS_RAW(rfd);
/*
* Flush the async queue if necessary.
*/
if (IS_ASYNC(&rdc_u_info[krdc->index]) && !RDC_IS_DISKQ(krdc->group)) {
int tries = 1;
while (krdc->group->ra_queue.blocks != 0 && tries--) {
if (!krdc->group->rdc_writer)
(void) rdc_writer(krdc->index);
(void) rdc_drain_queue(krdc->index);
}
/* force discard of possibly blocked flusher threads */
if (rdc_drain_queue(krdc->index) != 0) {
#ifdef DEBUG
net_queue *qp = &krdc->group->ra_queue;
#endif
do {
mutex_enter(&krdc->group->ra_queue.net_qlock);
krdc->group->asyncdis = 1;
cv_broadcast(&krdc->group->asyncqcv);
mutex_exit(&krdc->group->ra_queue.net_qlock);
cmn_err(CE_WARN,
"RDC: async I/O pending and not drained for %s during detach",
rdc_u_info[krdc->index].primary.file);
#ifdef DEBUG
cmn_err(CE_WARN,
"nitems: %" NSC_SZFMT " nblocks: %" NSC_SZFMT
" head: 0x%p tail: 0x%p",
qp->nitems, qp->blocks, (void *)qp->net_qhead,
(void *)qp->net_qtail);
#endif
} while (krdc->group->rdc_thrnum > 0);
}
}
mutex_enter(&krdc->devices->id_rlock);
if (krdc->iodev != iodev)
cmn_err(CE_WARN, "_rdc_detach: iodev mismatch %p : %p",
(void *) krdc->iodev, (void *) iodev);
krdc->iodev = NULL;
mutex_exit(&krdc->devices->id_rlock);
_rdc_rlse_devs(krdc, (raw ? RDC_RAW : RDC_CACHE));
return (0);
}
/*
* _rdc_get_pinned
*
* only affects local node.
*/
static int
_rdc_get_pinned(rdc_fd_t *rfd)
{
return (nsc_get_pinned(RDC_FD(rfd)));
}
/*
* _rdc_discard_pinned
*
* only affects local node.
*/
static int
_rdc_discard_pinned(rdc_fd_t *rfd, nsc_off_t pos, nsc_size_t len)
{
return (nsc_discard_pinned(RDC_FD(rfd), pos, len));
}
/*
* _rdc_partsize
*
* only affects the local node.
*/
static int
_rdc_partsize(rdc_fd_t *rfd, nsc_size_t *ptr)
{
rdc_u_info_t *urdc;
urdc = &rdc_u_info[rfd->rdc_info->index];
/* Always return saved size */
ASSERT(urdc->volume_size != 0);
*ptr = urdc->volume_size;
return (0);
}
/*
* _rdc_maxfbas
*
* only affects local node
*/
/* ARGSUSED */
static int
_rdc_maxfbas(rdc_fd_t *rfd, int flag, nsc_size_t *ptr)
{
rdc_k_info_t *krdc = rfd->rdc_info;
int raw = RDC_IS_RAW(rfd);
int rtype = raw ? RDC_RAW : RDC_CACHE;
int rc = 0;
if (krdc == NULL)
return (EINVAL);
if (flag == NSC_RDAHEAD || flag == NSC_CACHEBLK) {
rc = _rdc_rsrv_devs(krdc, rtype, RDC_INTERNAL);
if (rc == 0) {
rc = nsc_maxfbas(RDC_U_FD(krdc), flag, ptr);
_rdc_rlse_devs(krdc, rtype);
}
} else {
/* Always return saved size */
ASSERT(krdc->maxfbas != 0);
*ptr = krdc->maxfbas - 1;
}
return (rc);
}
/* ARGSUSED */
static int
_rdc_control(rdc_fd_t *rfd, int cmd, void *ptr, int len)
{
return (nsc_control(RDC_FD(rfd), cmd, ptr, len));
}
/*
* _rdc_attach_fd
*
* called by nsctl as part of nsc_reserve() processing when one of
* SNDR's underlying file descriptors becomes available and metadata
* should be re-acquired.
*/
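/*
* Hook-up sketch: this callback is wired in through the "Attach" entry of
* _rdc_fd_def (end of this file) and the blind_t passed to nsc_open() in
* rdc_idev_open(), so 'arg' is always one of &dp->id_cache_dev or
* &dp->id_raw_dev:
*
*	dp->id_cache_dev.bi_fd = nsc_open(pathname,
*	    NSC_RDCHR_ID|NSC_RDWR|NSC_DEVICE,
*	    _rdc_fd_def, (blind_t)&dp->id_cache_dev, rc);
*/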
static int
_rdc_attach_fd(blind_t arg)
{
_rdc_info_dev_t *dip = (_rdc_info_dev_t *)arg;
rdc_k_info_t *krdc;
rdc_u_info_t *urdc;
nsc_size_t maxfbas, partsize;
int rc;
krdc = dip->bi_krdc;
urdc = &rdc_u_info[krdc->index];
if ((rc = nsc_partsize(dip->bi_fd, &partsize)) != 0) {
cmn_err(CE_WARN,
"SNDR: cannot get volume size of %s, error %d",
nsc_pathname(dip->bi_fd), rc);
} else if (urdc->volume_size == 0 && partsize > 0) {
/* set volume size for the first time */
urdc->volume_size = partsize;
} else if (urdc->volume_size != partsize) {
/*
* SNDR cannot yet cope with a volume being resized,
* so fail it.
*/
if (!(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) {
rdc_many_enter(krdc);
if (rdc_get_vflags(urdc) & RDC_PRIMARY)
rdc_set_mflags(urdc, RDC_RSYNC_NEEDED);
else
rdc_set_mflags(urdc, RDC_SYNC_NEEDED);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"volume resized");
rdc_many_exit(krdc);
rdc_write_state(urdc);
}
cmn_err(CE_WARN,
"SNDR: %s changed size from %" NSC_SZFMT " to %" NSC_SZFMT,
nsc_pathname(dip->bi_fd), urdc->volume_size, partsize);
}
if ((rc = nsc_maxfbas(dip->bi_fd, 0, &maxfbas)) != 0) {
cmn_err(CE_WARN,
"SNDR: cannot get max transfer size for %s, error %d",
nsc_pathname(dip->bi_fd), rc);
} else if (maxfbas > 0) {
krdc->maxfbas = min(RDC_MAX_MAXFBAS, maxfbas);
}
return (0);
}
/*
* _rdc_pinned
*
* only affects local node
*/
static void
_rdc_pinned(_rdc_info_dev_t *dip, nsc_off_t pos, nsc_size_t len)
{
nsc_pinned_data(dip->bi_krdc->iodev, pos, len);
}
/*
* _rdc_unpinned
*
* only affects local node.
*/
static void
_rdc_unpinned(_rdc_info_dev_t *dip, nsc_off_t pos, nsc_size_t len)
{
nsc_unpinned_data(dip->bi_krdc->iodev, pos, len);
}
/*
* _rdc_read
*
* read the specified data into the buffer - go remote if the local volume
* is down, or if the remote end has more recent data because a reverse
* sync is in progress.
*/
static int
_rdc_read(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag)
{
rdc_k_info_t *krdc = h->rdc_fd->rdc_info;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
int remote = (RDC_REMOTE(h) || (rdc_get_mflags(urdc) & RDC_SLAVE));
int rc1, rc2;
rc1 = rc2 = 0;
if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) {
cmn_err(CE_WARN,
"_rdc_read: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len);
h->rdc_bufh.sb_error = EINVAL;
return (h->rdc_bufh.sb_error);
}
if (flag & NSC_NOBLOCK) {
cmn_err(CE_WARN,
"_rdc_read: removing unsupported NSC_NOBLOCK flag");
flag &= ~(NSC_NOBLOCK);
}
if (!remote) {
rc1 = nsc_read(h->rdc_bufp, pos, len, flag);
}
if (remote || !RDC_SUCCESS(rc1)) {
rc2 = _rdc_remote_read(krdc, &h->rdc_bufh, pos, len, flag);
}
if (remote && !RDC_SUCCESS(rc2))
h->rdc_bufh.sb_error = rc2;
else if (!RDC_SUCCESS(rc1) && !RDC_SUCCESS(rc2))
h->rdc_bufh.sb_error = rc1;
return (h->rdc_bufh.sb_error);
}
static int
_rdc_remote_write(rdc_k_info_t *krdc, rdc_buf_t *h, nsc_buf_t *nsc_h,
nsc_off_t pos, nsc_size_t len, int flag, uint_t bitmask)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
int rc = 0;
nsc_size_t plen, syncblockpos;
aio_buf_t *anon = NULL;
if (!(rdc_get_vflags(urdc) & RDC_PRIMARY))
return (EINVAL);
if ((rdc_get_vflags(urdc) & RDC_LOGGING) &&
(!IS_STATE(urdc, RDC_QUEUING))) {
goto done;
}
/*
* this check for RDC_SYNCING may seem redundant, but there is a window
* in rdc_sync, where an async set has not yet been transformed into a
* sync set.
*/
if ((!IS_ASYNC(urdc) || IS_STATE(urdc, RDC_SYNCING)) ||
RDC_REMOTE(h) ||
krdc->group->synccount > 0 ||
(rdc_get_vflags(urdc) & RDC_SLAVE) ||
(rdc_get_vflags(urdc) & RDC_VOL_FAILED) ||
(rdc_get_vflags(urdc) & RDC_BMP_FAILED)) {
/* sync mode, or remote io mode, or local device is dead */
rc = rdc_net_write(krdc->index, krdc->remote_index,
nsc_h, pos, len, RDC_NOSEQ, RDC_NOQUE, NULL);
if ((rc == 0) &&
!(rdc_get_vflags(urdc) & RDC_BMP_FAILED) &&
!(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) {
if (IS_STATE(urdc, RDC_SYNCING) &&
!IS_STATE(urdc, RDC_FULL) ||
!IS_STATE(urdc, RDC_SLAVE)) {
mutex_enter(&krdc->syncbitmutex);
syncblockpos = LOG_TO_FBA_NUM(krdc->syncbitpos);
DTRACE_PROBE4(rdc_remote_write,
nsc_off_t, krdc->syncbitpos,
nsc_off_t, syncblockpos,
nsc_off_t, pos,
nsc_size_t, len);
/*
* If the current I/O's position plus length is
* greater than the sync block position, only
* clear those blocks up to the sync block position.
*/
if (pos < syncblockpos) {
if ((pos + len) > syncblockpos)
plen = syncblockpos - pos;
else
plen = len;
RDC_CLR_BITMAP(krdc, pos, plen, bitmask,
RDC_BIT_BUMP);
}
mutex_exit(&krdc->syncbitmutex);
} else {
RDC_CLR_BITMAP(krdc, pos, len, bitmask,
RDC_BIT_BUMP);
}
} else if (rc != 0) {
rdc_group_enter(krdc);
rdc_set_flags_log(urdc, RDC_LOGGING,
"net write failed");
rdc_write_state(urdc);
if (rdc_get_vflags(urdc) & RDC_SYNCING)
krdc->disk_status = 1;
rdc_group_exit(krdc);
}
} else if (!IS_STATE(urdc, RDC_SYNCING)) {
DTRACE_PROBE1(async_enque_start, rdc_buf_t *, h);
ASSERT(krdc->group->synccount == 0);
/* async mode */
if ((h == NULL) || ((h->rdc_flags & RDC_ASYNC_VEC) == 0)) {
rc = _rdc_enqueue_write(krdc, pos, len, flag, NULL);
} else {
anon = rdc_aio_buf_get(h, krdc->index);
if (anon == NULL) {
#ifdef DEBUG
cmn_err(CE_WARN,
"enqueue write failed for handle %p",
(void *) h);
#endif
return (EINVAL);
}
rc = _rdc_enqueue_write(krdc, pos, len, flag,
anon->rdc_abufp);
/*
* get rid of the aio_buf_t now, as this
* may not be the set that this rdc_buf
* was allocated on; we are done with it anyway,
* and the enqueuing code frees the nsc_abuf.
*/
rdc_aio_buf_del(h, krdc);
}
} else {
ASSERT(IS_STATE(urdc, RDC_SYNCING));
ASSERT(0);
}
done:
if ((anon == NULL) && h && (h->rdc_flags & RDC_ASYNC_VEC)) {
/*
* Toss the anonymous buffer if we have one allocated.
*/
anon = rdc_aio_buf_get(h, krdc->index);
if (anon) {
(void) nsc_free_buf(anon->rdc_abufp);
rdc_aio_buf_del(h, krdc);
}
}
return (rc);
}
/*
* _rdc_multi_write
*
* Send to multihop remote. Obeys 1 to many if present and we are crazy
* enough to support it.
*
*/
int
_rdc_multi_write(nsc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag,
rdc_k_info_t *krdc)
{
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
rdc_k_info_t *this = krdc; /* krdc that was requested */
int rc, retval;
uint_t bitmask;
retval = rc = 0;
if (!RDC_HANDLE_LIMITS(h, pos, len)) {
cmn_err(CE_WARN,
"_rdc_multi_write: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->sb_pos, len, h->sb_len);
return (EINVAL);
}
/* if this is a 1 to many, set all the bits for all the sets */
do {
if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) {
(void) nsc_uncommit(h, pos, len, flag);
/* set the error, but try other sets */
retval = EIO;
}
if (IS_MANY(krdc) && IS_STATE(urdc, RDC_PRIMARY)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
break;
}
rdc_many_exit(krdc);
}
} while (krdc != this);
urdc = &rdc_u_info[krdc->index];
if (flag & NSC_NOBLOCK) {
cmn_err(CE_WARN,
"_rdc_multi_write: removing unsupported NSC_NOBLOCK flag");
flag &= ~(NSC_NOBLOCK);
}
multiwrite1:
if ((rdc_get_vflags(urdc) & RDC_PRIMARY) &&
(!IS_STATE(urdc, RDC_LOGGING) ||
(IS_STATE(urdc, RDC_LOGGING) &&
IS_STATE(urdc, RDC_QUEUING)))) {
rc = _rdc_remote_write(krdc, NULL, h, pos, len, flag, bitmask);
}
if (!RDC_SUCCESS(rc) && retval == 0) {
retval = rc;
}
multiwrite2:
if (IS_MANY(krdc) && (rdc_get_vflags(urdc) & RDC_PRIMARY)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
rc = 0;
rdc_many_exit(krdc);
goto multiwrite1;
}
rdc_many_exit(krdc);
}
return (retval);
}
void
_rdc_diskq_enqueue_thr(rdc_aio_t *p)
{
rdc_thrsync_t *sync = (rdc_thrsync_t *)p->next;
rdc_k_info_t *krdc = &rdc_k_info[p->index];
int rc2;
rc2 = rdc_diskq_enqueue(krdc, p);
/*
* overload flag with error return if any
*/
if (!RDC_SUCCESS(rc2)) {
p->flag = rc2;
} else {
p->flag = 0;
}
mutex_enter(&sync->lock);
sync->complete++;
cv_broadcast(&sync->cv);
mutex_exit(&sync->lock);
}
/*
* _rdc_sync_write_thr
* synchronous write thread which writes to the network while
* the local write is occurring
*/
void
_rdc_sync_write_thr(rdc_aio_t *p)
{
rdc_thrsync_t *sync = (rdc_thrsync_t *)p->next;
rdc_buf_t *h = (rdc_buf_t *)p->handle;
rdc_k_info_t *krdc = &rdc_k_info[p->index];
#ifdef DEBUG
rdc_u_info_t *urdc;
#endif
int rc2;
int bitmask;
rdc_group_enter(krdc);
krdc->aux_state |= RDC_AUXWRITE;
#ifdef DEBUG
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc)) {
cmn_err(CE_WARN, "rdc_sync_write_thr: set not enabled %s:%s",
urdc->secondary.file,
urdc->secondary.bitmap);
}
#endif
rdc_group_exit(krdc);
bitmask = p->iostatus; /* overload */
rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh, p->pos, p->len,
p->flag, bitmask);
/*
* overload flag with error return if any
*/
if (!RDC_SUCCESS(rc2)) {
p->flag = rc2;
} else {
p->flag = 0;
}
rdc_group_enter(krdc);
krdc->aux_state &= ~RDC_AUXWRITE;
rdc_group_exit(krdc);
mutex_enter(&sync->lock);
sync->complete++;
cv_broadcast(&sync->cv);
mutex_exit(&sync->lock);
}
/*
* _rdc_write
*
* Commit changes to the buffer locally and send remote.
*
* If this write is whilst the local primary volume is being synced,
* then we write the remote end first to ensure that the new data
* cannot be overwritten by a concurrent sync operation.
*/
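/*
* Ordering sketch for the reverse-sync (rsync) case described above,
* condensed from the body below; the remote write is issued before the
* local write so a concurrent sync cannot overwrite the new data:
*
*	(void) RDC_SET_BITMAP(krdc, pos, len, &bitmask);
*	rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh, pos, len,
*	    flag, bitmask);				remote end first
*	rc1 = nsc_write(h->rdc_bufp, pos, len, flag);	then local volume
*
* In the plain sync case the remote write is instead launched on an
* nst_thread via _rdc_sync_write_thr() so that it can overlap the local
* nsc_write().
*/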
static int
_rdc_write(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag)
{
rdc_k_info_t *krdc = h->rdc_fd->rdc_info;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
rdc_k_info_t *this;
rdc_k_info_t *multi = NULL;
int remote = RDC_REMOTE(h);
int rc1, rc2;
uint_t bitmask;
int first;
int rsync;
int nthr;
int winddown;
int thrrc = 0;
rdc_aio_t *bp[SNDR_MAXTHREADS];
aio_buf_t *anon;
nsthread_t *tp;
rdc_thrsync_t *sync = &h->rdc_sync;
/* If this is the multi-hop secondary, move along to the primary */
if (IS_MULTI(krdc) && !IS_PRIMARY(urdc)) {
multi = krdc;
krdc = krdc->multi_next;
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc)) {
krdc = h->rdc_fd->rdc_info;
urdc = &rdc_u_info[krdc->index];
multi = NULL;
}
}
this = krdc;
rsync = (IS_PRIMARY(urdc)) && (IS_SLAVE(urdc));
/*
* If this is a many group with a reverse sync in progress and
* this is not the slave krdc/urdc, then search for the slave
* so that we can do the remote io to the correct secondary
* before the local io.
*/
if (rsync && !(IS_SLAVE(urdc))) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
if (rdc_get_vflags(urdc) & RDC_SLAVE)
break;
}
rdc_many_exit(krdc);
this = krdc;
}
urdc = &rdc_u_info[krdc->index];
rc1 = rc2 = 0;
first = 1;
nthr = 0;
if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) {
cmn_err(CE_WARN,
"_rdc_write: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len);
h->rdc_bufh.sb_error = EINVAL;
return (h->rdc_bufh.sb_error);
}
DTRACE_PROBE(rdc_write_bitmap_start);
/* if this is a 1 to many, set all the bits for all the sets */
do {
if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) {
if (rdc_eio_nobmp) {
(void) nsc_uncommit(h->rdc_bufp, pos, len, flag);
/* set the error, but try the other sets */
h->rdc_bufh.sb_error = EIO;
}
}
if (IS_MANY(krdc) && IS_STATE(urdc, RDC_PRIMARY)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
break;
}
rdc_many_exit(krdc);
}
} while (krdc != this);
urdc = &rdc_u_info[krdc->index];
DTRACE_PROBE(rdc_write_bitmap_end);
write1:
/* just in case we switch mode during write */
if (IS_ASYNC(urdc) && (!IS_STATE(urdc, RDC_SYNCING)) &&
(!IS_STATE(urdc, RDC_LOGGING) ||
IS_STATE(urdc, RDC_QUEUING))) {
h->rdc_flags |= RDC_ASYNC_BUF;
}
if (BUF_IS_ASYNC(h)) {
/*
* We are async mode
*/
aio_buf_t *p;
DTRACE_PROBE(rdc_write_async_start);
if ((krdc->type_flag & RDC_DISABLEPEND) ||
((IS_STATE(urdc, RDC_LOGGING) &&
!IS_STATE(urdc, RDC_QUEUING)))) {
goto localwrite;
}
if (IS_STATE(urdc, RDC_VOL_FAILED)) {
/*
* overload remote as we don't want to do local
* IO later. forge ahead with async
*/
remote++;
}
if ((IS_STATE(urdc, RDC_SYNCING)) ||
(IS_STATE(urdc, RDC_LOGGING) &&
!IS_STATE(urdc, RDC_QUEUING))) {
goto localwrite;
}
p = rdc_aio_buf_add(krdc->index, h);
if (p == NULL) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_alloc_buf aio_buf allocation failed");
#endif
goto localwrite;
}
mutex_enter(&h->aio_lock);
DTRACE_PROBE(rdc_write_async__allocabuf_start);
rc1 = nsc_alloc_abuf(pos, len, 0, &p->rdc_abufp);
DTRACE_PROBE(rdc_write_async__allocabuf_end);
if (!RDC_SUCCESS(rc1)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_alloc_buf NSC_ANON allocation failed"
" rc %d",
rc1);
#endif
mutex_exit(&h->aio_lock);
goto localwrite;
}
h->rdc_flags |= RDC_ASYNC_VEC;
mutex_exit(&h->aio_lock);
/*
* Copy buffer into anonymous buffer
*/
DTRACE_PROBE(rdc_write_async_nsccopy_start);
rc1 =
nsc_copy(&h->rdc_bufh, p->rdc_abufp, pos, pos, len);
DTRACE_PROBE(rdc_write_async_nsccopy_end);
if (!RDC_SUCCESS(rc1)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"_rdc_write: nsc_copy failed rc=%d state %x",
rc1, rdc_get_vflags(urdc));
#endif
rc1 = nsc_free_buf(p->rdc_abufp);
rdc_aio_buf_del(h, krdc);
rdc_group_enter(krdc);
rdc_group_log(krdc, RDC_FLUSH|RDC_OTHERREMOTE,
"nsc_copy failure");
rdc_group_exit(krdc);
}
DTRACE_PROBE(rdc_write_async_end);
/*
* using a diskq, launch a thread to queue it
* and free the aio->h and aio;
* if the thread fails, do it the old way (see localwrite)
*/
if (RDC_IS_DISKQ(krdc->group)) {
if (nthr >= SNDR_MAXTHREADS) {
#ifdef DEBUG
cmn_err(CE_NOTE, "nthr overrun in _rdc_write");
#endif
thrrc = ENOEXEC;
goto localwrite;
}
anon = rdc_aio_buf_get(h, krdc->index);
if (anon == NULL) {
#ifdef DEBUG
cmn_err(CE_WARN, "rdc_aio_buf_get failed for "
"%p", (void *)h);
#endif
thrrc = ENOEXEC;
goto localwrite;
}
/* get a populated rdc_aio_t */
bp[nthr] =
rdc_aio_tbuf_get(sync, anon->rdc_abufp, pos, len,
flag, krdc->index, bitmask);
if (bp[nthr] == NULL) {
#ifdef DEBUG
cmn_err(CE_NOTE, "_rdcwrite: "
"kmem_alloc failed bp aio (1)");
#endif
thrrc = ENOEXEC;
goto localwrite;
}
/* start the queue io */
tp = nst_create(_rdc_ioset, _rdc_diskq_enqueue_thr,
(void *)bp[nthr], NST_SLEEP);
if (tp == NULL) {
#ifdef DEBUG
cmn_err(CE_NOTE,
"_rdcwrite: nst_create failure");
#endif
thrrc = ENOEXEC;
} else {
mutex_enter(&(sync->lock));
sync->threads++;
mutex_exit(&(sync->lock));
nthr++;
}
/*
* the handle that is to be enqueued is now in
* the rdc_aio_t, and will be freed there.
* dump the aio_t now. If this is 1 to many
* we may not do this in _rdc_free_buf()
* if this was not the index that the rdc_buf_t
* was allocated on.
*/
rdc_aio_buf_del(h, krdc);
}
} /* end of async */
/*
* We try to overlap local and network IO for the sync case
* (we already do it for async).
* If one to many, we need to track the resulting nst_thread
* so we don't trash the nsc_buf on a free.
* Start network IO first, then do local IO (sync only).
*/
if (IS_PRIMARY(urdc) && !IS_STATE(urdc, RDC_LOGGING) &&
!BUF_IS_ASYNC(h)) {
/*
* if forward syncing, we must do local IO first
* then remote io. Don't spawn thread
*/
if (!rsync && (IS_STATE(urdc, RDC_SYNCING))) {
thrrc = ENOEXEC;
goto localwrite;
}
if (IS_MULTI(krdc)) {
rdc_k_info_t *ktmp;
rdc_u_info_t *utmp;
ktmp = krdc->multi_next;
utmp = &rdc_u_info[ktmp->index];
if (IS_ENABLED(utmp))
multi = ktmp;
}
if (nthr >= SNDR_MAXTHREADS) {
#ifdef DEBUG
cmn_err(CE_NOTE, "nthr overrun in _rdc_write");
#endif
thrrc = ENOEXEC;
goto localwrite;
}
bp[nthr] = rdc_aio_tbuf_get(sync, h, pos, len,
flag, krdc->index, bitmask);
if (bp[nthr] == NULL) {
#ifdef DEBUG
cmn_err(CE_NOTE, "_rdcwrite: kmem_alloc failed bp aio");
#endif
thrrc = ENOEXEC;
goto localwrite;
}
tp = nst_create(_rdc_ioset, _rdc_sync_write_thr,
(void *)bp[nthr], NST_SLEEP);
if (tp == NULL) {
#ifdef DEBUG
cmn_err(CE_NOTE,
"_rdcwrite: nst_create failure");
#endif
thrrc = ENOEXEC;
} else {
mutex_enter(&(sync->lock));
sync->threads++;
mutex_exit(&(sync->lock));
nthr++;
}
}
localwrite:
if (!remote && !rsync && first) {
DTRACE_PROBE(rdc_write_nscwrite_start);
rc1 = nsc_write(h->rdc_bufp, pos, len, flag);
DTRACE_PROBE(rdc_write_nscwrite_end);
if (!RDC_SUCCESS(rc1)) {
rdc_many_enter(krdc);
if (IS_PRIMARY(urdc))
/* Primary, so reverse sync needed */
rdc_set_mflags(urdc, RDC_RSYNC_NEEDED);
else
/* Secondary, so sync needed */
rdc_set_flags(urdc, RDC_SYNC_NEEDED);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"local write failed");
rdc_many_exit(krdc);
rdc_write_state(urdc);
}
}
/*
* This is where we either enqueue async IO for the flusher,
* or do sync IO in the case of an error in thread creation,
* or when we are doing a forward sync.
* NOTE: if we are async, and using a diskq, we have
* already enqueued this write.
* _rdc_remote_write will end up enqueueing to memory,
* or, in the case of a thread creation error above, retrying
* the diskq enqueue if thrrc == ENOEXEC.
*/
if ((IS_PRIMARY(urdc)) && (thrrc == ENOEXEC) ||
(BUF_IS_ASYNC(h) && !RDC_IS_DISKQ(krdc->group))) {
thrrc = 0;
if (IS_MULTI(krdc)) {
rdc_k_info_t *ktmp;
rdc_u_info_t *utmp;
ktmp = krdc->multi_next;
utmp = &rdc_u_info[ktmp->index];
if (IS_ENABLED(utmp))
multi = ktmp;
}
DTRACE_PROBE(rdc_write_remote_start);
rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh,
pos, len, flag, bitmask);
DTRACE_PROBE(rdc_rdcwrite_remote_end);
}
if (!RDC_SUCCESS(rc1)) {
if ((IS_PRIMARY(urdc)) && !RDC_SUCCESS(rc2)) {
h->rdc_bufh.sb_error = rc1;
}
} else if ((remote || rsync) && !RDC_SUCCESS(rc2)) {
h->rdc_bufh.sb_error = rc2;
}
write2:
/*
* If one to many, jump back into the loop to continue IO
*/
if (IS_MANY(krdc) && (IS_PRIMARY(urdc))) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
rc2 = first = 0;
h->rdc_flags &= ~RDC_ASYNC_BUF;
rdc_many_exit(krdc);
goto write1;
}
rdc_many_exit(krdc);
}
urdc = &rdc_u_info[krdc->index];
/*
* collect all of our threads if any
*/
if (nthr) {
mutex_enter(&(sync->lock));
/* wait for the threads */
while (sync->complete != sync->threads) {
cv_wait(&(sync->cv), &(sync->lock));
}
mutex_exit(&(sync->lock));
/* collect status */
winddown = 0;
while (winddown < nthr) {
/*
* Get any error return from thread
*/
if ((remote || rsync) && bp[winddown]->flag) {
h->rdc_bufh.sb_error =
bp[winddown]->flag;
}
if (bp[winddown])
kmem_free(bp[winddown], sizeof (rdc_aio_t));
winddown++;
}
}
if (rsync && !(IS_STATE(urdc, RDC_VOL_FAILED))) {
rc1 = nsc_write(h->rdc_bufp, pos, len, flag);
if (!RDC_SUCCESS(rc1)) {
/* rsync, so reverse sync needed already set */
rdc_many_enter(krdc);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"rsync local write failed");
rdc_many_exit(krdc);
rdc_write_state(urdc);
/*
* only report the error if a remote error
* occurred as well.
*/
if (h->rdc_bufh.sb_error)
h->rdc_bufh.sb_error = rc1;
}
}
if (multi) {
/* Multi-hop secondary, just set bits in the bitmap */
(void) RDC_SET_BITMAP(multi, pos, len, &bitmask);
}
return (h->rdc_bufh.sb_error);
}
static void
_rdc_bzero(nsc_buf_t *h, nsc_off_t pos, nsc_size_t len)
{
nsc_vec_t *v;
uchar_t *a;
size_t sz;
int l;
if (!RDC_HANDLE_LIMITS(h, pos, len)) {
cmn_err(CE_WARN,
"_rdc_bzero: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->sb_pos, len, h->sb_len);
return;
}
if (!len)
return;
/* find starting point */
v = h->sb_vec;
pos -= h->sb_pos;
for (; pos >= FBA_NUM(v->sv_len); v++)
pos -= FBA_NUM(v->sv_len);
a = v->sv_addr + FBA_SIZE(pos);
l = v->sv_len - FBA_SIZE(pos);
/* zero */
len = FBA_SIZE(len); /* convert to bytes */
while (len) {
if (!a) /* end of vec */
break;
sz = (size_t)min((nsc_size_t)l, len);
bzero(a, sz);
len -= sz;
l -= sz;
a += sz;
if (!l) {
v++;
a = v->sv_addr;
l = v->sv_len;
}
}
}
/*
* _rdc_zero
*
* Zero and commit the specified area of the buffer.
*
* If this write is whilst the local primary volume is being synced,
* then we write the remote end first to ensure that the new data
* cannot be overwritten by a concurrent sync operation.
*/
static int
_rdc_zero(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag)
{
rdc_k_info_t *krdc = h->rdc_fd->rdc_info;
rdc_u_info_t *urdc = &rdc_u_info[krdc->index];
rdc_k_info_t *this;
rdc_k_info_t *multi = NULL;
int remote = RDC_REMOTE(h);
int rc1, rc2;
uint_t bitmask;
int first;
int rsync;
/* If this is the multi-hop secondary, move along to the primary */
if (IS_MULTI(krdc) && !(rdc_get_vflags(urdc) & RDC_PRIMARY)) {
multi = krdc;
krdc = krdc->multi_next;
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc)) {
krdc = h->rdc_fd->rdc_info;
urdc = &rdc_u_info[krdc->index];
multi = NULL;
}
}
this = krdc;
rsync = ((rdc_get_vflags(urdc) & RDC_PRIMARY) &&
(rdc_get_mflags(urdc) & RDC_SLAVE));
/*
* If this is a many group with a reverse sync in progress and
* this is not the slave krdc/urdc, then search for the slave
* so that we can do the remote io to the correct secondary
* before the local io.
*/
if (rsync && !(rdc_get_vflags(urdc) & RDC_SLAVE)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
if (rdc_get_vflags(urdc) & RDC_SLAVE)
break;
}
rdc_many_exit(krdc);
this = krdc;
}
rc1 = rc2 = 0;
first = 1;
if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) {
cmn_err(CE_WARN,
"_rdc_zero: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len);
h->rdc_bufh.sb_error = EINVAL;
return (h->rdc_bufh.sb_error);
}
zero1:
if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) {
(void) nsc_uncommit(h->rdc_bufp, pos, len, flag);
h->rdc_bufh.sb_error = EIO;
goto zero2;
}
if (IS_ASYNC(urdc)) {
/*
* We are async mode
*/
aio_buf_t *p;
if ((krdc->type_flag & RDC_DISABLEPEND) ||
(rdc_get_vflags(urdc) & RDC_LOGGING)) {
mutex_exit(&krdc->group->ra_queue.net_qlock);
goto localzero;
}
if ((rdc_get_vflags(urdc) & RDC_VOL_FAILED) ||
(rdc_get_vflags(urdc) & RDC_BMP_FAILED)) {
mutex_exit(&krdc->group->ra_queue.net_qlock);
goto zero2;
}
if (rdc_get_vflags(urdc) & RDC_LOGGING) {
mutex_exit(&krdc->group->ra_queue.net_qlock);
goto localzero;
}
p = rdc_aio_buf_add(krdc->index, h);
if (p == NULL) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_alloc_buf aio_buf allocation failed");
#endif
goto localzero;
}
mutex_enter(&h->aio_lock);
rc1 = nsc_alloc_abuf(pos, len, 0, &p->rdc_abufp);
if (!RDC_SUCCESS(rc1)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"rdc_alloc_buf NSC_ANON allocation failed rc %d",
rc1);
#endif
mutex_exit(&h->aio_lock);
goto localzero;
}
h->rdc_flags |= RDC_ASYNC_VEC;
mutex_exit(&h->aio_lock);
/*
* Copy buffer into anonymous buffer
*/
rc1 = nsc_zero(p->rdc_abufp, pos, len, flag);
if (!RDC_SUCCESS(rc1)) {
#ifdef DEBUG
cmn_err(CE_WARN,
"_rdc_zero: nsc_zero failed rc=%d state %x",
rc1, rdc_get_vflags(urdc));
#endif
rc1 = nsc_free_buf(p->rdc_abufp);
rdc_aio_buf_del(h, krdc);
rdc_group_enter(krdc);
rdc_group_log(krdc, RDC_FLUSH | RDC_OTHERREMOTE,
"nsc_zero failed");
rdc_group_exit(krdc);
}
} /* end of async */
localzero:
if (flag & NSC_NOBLOCK) {
cmn_err(CE_WARN,
"_rdc_zero: removing unsupported NSC_NOBLOCK flag");
flag &= ~(NSC_NOBLOCK);
}
if (!remote && !rsync && first) {
rc1 = nsc_zero(h->rdc_bufp, pos, len, flag);
if (!RDC_SUCCESS(rc1)) {
ASSERT(rdc_get_vflags(urdc) & RDC_PRIMARY);
rdc_many_enter(krdc);
/* Primary, so reverse sync needed */
rdc_set_mflags(urdc, RDC_RSYNC_NEEDED);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"nsc_zero failed");
rdc_many_exit(krdc);
rdc_write_state(urdc);
}
}
/*
* send new data to remote end - nsc_zero has zero'd
* the data in the buffer, or _rdc_bzero will be used below.
*/
if (rdc_get_vflags(urdc) & RDC_PRIMARY) {
if (first && (remote || rsync || !RDC_SUCCESS(rc1))) {
/* bzero so that we can send new data to remote node */
_rdc_bzero(&h->rdc_bufh, pos, len);
}
if (IS_MULTI(krdc)) {
rdc_k_info_t *ktmp;
rdc_u_info_t *utmp;
ktmp = krdc->multi_next;
utmp = &rdc_u_info[ktmp->index];
if (IS_ENABLED(utmp))
multi = ktmp;
}
rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh,
pos, len, flag, bitmask);
}
if (!RDC_SUCCESS(rc1)) {
if ((rdc_get_vflags(urdc) & RDC_PRIMARY) && !RDC_SUCCESS(rc2)) {
h->rdc_bufh.sb_error = rc1;
}
} else if ((remote || rsync) && !RDC_SUCCESS(rc2)) {
h->rdc_bufh.sb_error = rc2;
}
zero2:
if (IS_MANY(krdc) && (rdc_get_vflags(urdc) & RDC_PRIMARY)) {
rdc_many_enter(krdc);
for (krdc = krdc->many_next; krdc != this;
krdc = krdc->many_next) {
urdc = &rdc_u_info[krdc->index];
if (!IS_ENABLED(urdc))
continue;
rc2 = first = 0;
rdc_many_exit(krdc);
goto zero1;
}
rdc_many_exit(krdc);
}
if (rsync && !(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) {
rc1 = nsc_write(h->rdc_bufp, pos, len, flag);
if (!RDC_SUCCESS(rc1)) {
/* rsync, so reverse sync needed already set */
rdc_many_enter(krdc);
rdc_set_flags_log(urdc, RDC_VOL_FAILED,
"nsc_write failed");
rdc_many_exit(krdc);
rdc_write_state(urdc);
/*
* only report the error if a remote error
* occurred as well.
*/
if (h->rdc_bufh.sb_error)
h->rdc_bufh.sb_error = rc1;
}
}
if (multi) {
/* Multi-hop secondary, just set bits in the bitmap */
(void) RDC_SET_BITMAP(multi, pos, len, &bitmask);
}
return (h->rdc_bufh.sb_error);
}
/*
* _rdc_uncommit
* - refresh the specified data region in the buffer to prevent the cache
* from serving the scribbled-on data back to another client.
*
* Only needs to happen on the local node. If in remote io mode, then
* just return 0 - we do not cache the data on the local node and the
* changed data will not have made it to the cache on the other node,
* so it has no need to uncommit.
*/
static int
_rdc_uncommit(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag)
{
int remote = RDC_REMOTE(h);
int rc = 0;
if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) {
cmn_err(CE_WARN,
"_rdc_uncommit: bounds check: io(handle) pos %" NSC_XSZFMT
"(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")",
pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len);
h->rdc_bufh.sb_error = EINVAL;
return (h->rdc_bufh.sb_error);
}
if (flag & NSC_NOBLOCK) {
cmn_err(CE_WARN,
"_rdc_uncommit: removing unsupported NSC_NOBLOCK flag");
flag &= ~(NSC_NOBLOCK);
}
if (!remote) {
rc = nsc_uncommit(h->rdc_bufp, pos, len, flag);
}
if (!RDC_SUCCESS(rc))
h->rdc_bufh.sb_error = rc;
return (rc);
}
/*
* _rdc_trksize
*
* only needs to happen on local node.
*/
static int
_rdc_trksize(rdc_fd_t *rfd, nsc_size_t trksize)
{
return (nsc_set_trksize(RDC_FD(rfd), trksize));
}
static nsc_def_t _rdc_fd_def[] = {
"Attach", (uintptr_t)_rdc_attach_fd, 0,
"Pinned", (uintptr_t)_rdc_pinned, 0,
"Unpinned", (uintptr_t)_rdc_unpinned, 0,
0, 0, 0
};
static nsc_def_t _rdc_io_def[] = {
"Open", (uintptr_t)_rdc_openc, 0,
"Close", (uintptr_t)_rdc_close, 0,
"Attach", (uintptr_t)_rdc_attach, 0,
"Detach", (uintptr_t)_rdc_detach, 0,
"AllocHandle", (uintptr_t)_rdc_alloc_handle, 0,
"FreeHandle", (uintptr_t)_rdc_free_handle, 0,
"AllocBuf", (uintptr_t)_rdc_alloc_buf, 0,
"FreeBuf", (uintptr_t)_rdc_free_buf, 0,
"GetPinned", (uintptr_t)_rdc_get_pinned, 0,
"Discard", (uintptr_t)_rdc_discard_pinned, 0,
"PartSize", (uintptr_t)_rdc_partsize, 0,
"MaxFbas", (uintptr_t)_rdc_maxfbas, 0,
"Control", (uintptr_t)_rdc_control, 0,
"Read", (uintptr_t)_rdc_read, 0,
"Write", (uintptr_t)_rdc_write, 0,
"Zero", (uintptr_t)_rdc_zero, 0,
"Uncommit", (uintptr_t)_rdc_uncommit, 0,
"TrackSize", (uintptr_t)_rdc_trksize, 0,
"Provide", 0, 0,
0, 0, 0
};
static nsc_def_t _rdc_ior_def[] = {
"Open", (uintptr_t)_rdc_openr, 0,
"Close", (uintptr_t)_rdc_close, 0,
"Attach", (uintptr_t)_rdc_attach, 0,
"Detach", (uintptr_t)_rdc_detach, 0,
"AllocHandle", (uintptr_t)_rdc_alloc_handle, 0,
"FreeHandle", (uintptr_t)_rdc_free_handle, 0,
"AllocBuf", (uintptr_t)_rdc_alloc_buf, 0,
"FreeBuf", (uintptr_t)_rdc_free_buf, 0,
"GetPinned", (uintptr_t)_rdc_get_pinned, 0,
"Discard", (uintptr_t)_rdc_discard_pinned, 0,
"PartSize", (uintptr_t)_rdc_partsize, 0,
"MaxFbas", (uintptr_t)_rdc_maxfbas, 0,
"Control", (uintptr_t)_rdc_control, 0,
"Read", (uintptr_t)_rdc_read, 0,
"Write", (uintptr_t)_rdc_write, 0,
"Zero", (uintptr_t)_rdc_zero, 0,
"Uncommit", (uintptr_t)_rdc_uncommit, 0,
"TrackSize", (uintptr_t)_rdc_trksize, 0,
"Provide", 0, 0,
0, 0, 0
};
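/*
* Dispatch sketch: nsctl resolves the named entry points in the tables
* above when they are handed to nsc_register_io() by _rdc_init_dev(), e.g.
*
*	_rdc_io_hc = nsc_register_io("rdc-high-cache",
*	    NSC_RDCH_ID|NSC_REFCNT|NSC_FILTER, _rdc_io_def);
*
* so a client read on a cached RDC path reaches _rdc_read() through the
* "Read" entry of _rdc_io_def, and a raw-path read reaches it through
* _rdc_ior_def.
*/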