/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>
md_ops_t mirror_md_ops;
#ifndef lint
md_ops_t *md_interface_ops = &mirror_md_ops;
#endif
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_mstr_daemon;
extern mdq_anchor_t md_mirror_daemon;
extern mdq_anchor_t md_mirror_io_daemon;
extern mdq_anchor_t md_mirror_rs_daemon;
extern mdq_anchor_t md_mhs_daemon;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern int md_status;
extern clock_t md_hz;
extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern int md_mtioctl_cnt;
daemon_request_t mirror_timeout;
static daemon_request_t hotspare_request;
static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */
int md_mirror_mcs_buf_off;
/* Flags for mdmn_ksend_message to allow debugging */
int md_mirror_msg_flags;
#ifdef DEBUG
/* Flag to switch on debug messages */
int mirror_debug_flag = 0;
#endif
/*
* Struct used to hold the count of DMR reads and the timestamp of the last
* DMR read. It is used to verify, using a debugger, that the DMR read ioctl
* has been executed.
*/
dmr_stats_t mirror_dmr_stats = {0, 0};
/*
* Mutex protecting list of non-failfast drivers.
*/
static kmutex_t non_ff_drv_mutex;
extern char **non_ff_drivers;
extern major_t md_major;
/*
* Write-On-Write memory pool.
*/
static void copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t *mirror_wowblk_cache = NULL;
static int md_wowbuf_size = 16384;
static size_t md_wowblk_size;
/*
* md_mirror_wow_flg is a flag that allows:
* - disabling the write-on-write mechanism.
* - logging occurrences of write-on-write.
* - switching wow handling procedure processing.
* md_mirror_wow_cnt counts the occurrences of WOW.
*/
static uint_t md_mirror_wow_flg = 0;
static int md_mirror_wow_cnt = 0;
/*
* Tunable to enable/disable dirty region
* processing when closing down a mirror.
*/
static int new_resync = 1;
kmem_cache_t *mirror_parent_cache = NULL;
kmem_cache_t *mirror_child_cache = NULL;
extern int md_ff_disable; /* disable failfast */
static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void mirror_read_strategy(buf_t *, int, void *);
static void mirror_write_strategy(buf_t *, int, void *);
static void become_owner(daemon_queue_t *);
static int mirror_done(struct buf *cb);
static int mirror_done_common(struct buf *cb);
static void clear_retry_error(struct buf *cb);
/*
* patchables
*/
int md_min_rr_size = 200; /* 200 blocks, or 100k */
int md_def_num_rr = 1000; /* Default number of dirty regions */
/*
* patchable to change the delay before rescheduling a mirror ownership
* request. Value is in microseconds; the default is 0.5 seconds.
*/
clock_t md_mirror_owner_to = 500000;
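/*
 * kmem cache constructors, destructors and re-initializers for the mirror
 * parent (md_mps_t) and child (md_mcs_t) structures. Only the embedded
 * mutex/buf are constructed once per cache object; the remaining fields are
 * reset on each allocation by mirror_parent_init()/mirror_child_init().
 */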
/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
static void
mirror_parent_init(md_mps_t *ps)
{
bzero(ps, offsetof(md_mps_t, ps_mx));
bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}
/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
mutex_destroy(&((md_mps_t *)p)->ps_mx);
}
/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
bioinit(&((md_mcs_t *)p)->cs_buf);
return (0);
}
void
mirror_child_init(md_mcs_t *cs)
{
cs->cs_ps = NULL;
cs->cs_mdunit = 0;
md_bioreset(&cs->cs_buf);
}
/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
biofini(&((md_mcs_t *)p)->cs_buf);
}
static void
mirror_wowblk_init(wowhdr_t *p)
{
bzero(p, md_wowblk_size);
}
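/*
 * send_poke_hotspares_msg: daemon context handler for a queued
 * poke_hotspares request. Note that the set number is passed to us through
 * the otherwise unused dq.qlen field of the daemon request (see
 * send_poke_hotspares() below).
 */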
static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
int rval;
int nretries = 0;
md_mn_msg_pokehsp_t pokehsp;
md_mn_kresult_t *kresult;
set_t setno = (set_t)drq->dq.qlen;
pokehsp.pokehsp_setno = setno;
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
retry_sphmsg:
rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
sizeof (pokehsp), kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
/* If we're shutting down already, pause things here. */
if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd has become reachable again, so retry once.
* If this fails we'll panic as the system is in an
* unexpected state.
*/
if (nretries++ == 0)
goto retry_sphmsg;
}
cmn_err(CE_PANIC,
"ksend_message failure: POKE_HOTSPARES");
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
/* Allow further requests to use this set's queue structure */
mutex_enter(&drq->dr_mx);
drq->dr_pending = 0;
mutex_exit(&drq->dr_mx);
}
/*
* Send a poke_hotspares message to the master node. To avoid swamping the
* commd handler with requests we only send a message if there is not one
* already outstanding. We punt the request to a separate thread context as
* we cannot afford to block waiting on the request to be serviced. This is
* essential when a reconfig cycle is in progress as any open() of a multinode
* metadevice may result in a livelock.
*/
static void
send_poke_hotspares(set_t setno)
{
daemon_request_t *drq = &mn_hs_request[setno];
mutex_enter(&drq->dr_mx);
if (drq->dr_pending == 0) {
drq->dr_pending = 1;
drq->dq.qlen = (int)setno;
daemon_request(&md_mhs_daemon,
send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
}
mutex_exit(&drq->dr_mx);
}
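/*
 * mirror_set_sm_state: set the state of a submirror. If 'force' is set, the
 * new state is applied unconditionally; otherwise it is derived from the
 * states of the submirror's components (erred, resyncing, all-erred).
 */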
void
mirror_set_sm_state(
mm_submirror_t *sm,
mm_submirror_ic_t *smic,
sm_state_t newstate,
int force)
{
int compcnt;
int i;
int errcnt;
sm_state_t origstate;
md_m_shared_t *shared;
if (force) {
sm->sm_state = newstate;
uniqtime32(&sm->sm_timestamp);
return;
}
origstate = newstate;
compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
for (i = 0, errcnt = 0; i < compcnt; i++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, i);
if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
newstate |= SMS_COMP_ERRED;
if (shared->ms_state & (CS_RESYNC))
newstate |= SMS_COMP_RESYNC;
if (shared->ms_state & CS_ERRED)
errcnt++;
}
if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
newstate &= ~origstate;
if (errcnt == compcnt)
newstate |= SMS_ALL_ERRED;
else
newstate &= ~SMS_ALL_ERRED;
sm->sm_state = newstate;
uniqtime32(&sm->sm_timestamp);
}
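/*
 * mirror_geterror: scan the submirror components, starting at *smi / *cip,
 * for the next component that needs error processing. Returns 1 with *smi
 * and *cip updated when such a component is found, 0 otherwise. If clr_error
 * is set, the MDM_S_IOERR flag is cleared on each component examined.
 */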
static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
int frm_probe)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
int ci;
int i;
int compcnt;
int open_comp; /* flag for open component */
for (i = *smi; i < NMIRROR; i++) {
sm = &un->un_sm[i];
smic = &un->un_smic[i];
if (!SMS_IS(sm, SMS_INUSE))
continue;
compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
for (ci = *cip; ci < compcnt; ci++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
/*
* If called from any routine but probe, we check for the
* MDM_S_ISOPEN flag. Since probe does a pseudo open, it sets
* the MDM_S_PROBEOPEN flag and we test for that flag instead.
* The two tests are mutually exclusive.
*/
open_comp = (frm_probe) ?
(shared->ms_flags & MDM_S_PROBEOPEN):
(shared->ms_flags & MDM_S_ISOPEN);
if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
((shared->ms_state == CS_OKAY) ||
(shared->ms_state == CS_RESYNC))) ||
(!open_comp &&
(shared->ms_state == CS_LAST_ERRED))) {
if (clr_error) {
shared->ms_flags &= ~MDM_S_IOERR;
}
*cip = ci;
*smi = i;
return (1);
}
if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
shared->ms_flags &= ~MDM_S_IOERR;
}
}
*cip = 0;
}
return (0);
}
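/*
 * mirror_run_queue: callback that services the md_done_daemon queue when the
 * global daemon threads are not running.
 */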
/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
if (!(md_status & MD_GBL_DAEMONS_LIVE))
md_daemon(1, &md_done_daemon);
}
/*
* check_comp_4_hotspares
*
* This function attempts to allocate a hotspare for this component if the
* component is in error. In a MN set, the function can be called in 2 modes.
* It can be called either when a component error has been detected or when a
* new hotspare has been allocated. In both cases, MD_HOTSPARE_XMIT is set
* in flags and the request is sent to all nodes.
* The handler on each of the nodes then calls this function with
* MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
*
* For non-MN sets the function simply attempts to allocate a hotspare.
*
* On entry, the following locks are held
* mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
* md_unit_writerlock
*
* Returns 0 if ok
* 1 if the unit containing the component has been cleared while
* the mdmn_ksend_message() was being executed
*/
extern int
check_comp_4_hotspares(
mm_unit_t *un,
int smi,
int ci,
uint_t flags,
mddb_recid_t hs_id, /* Only used by MN disksets */
IOLOCK *lockp /* can be NULL */
)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
mddb_recid_t recids[6];
minor_t mnum;
intptr_t (*hs_dev)();
void (*hs_done)();
void *hs_data;
md_error_t mde = mdnullerror;
set_t setno;
md_mn_msg_allochsp_t allochspmsg;
md_mn_kresult_t *kresult;
mm_unit_t *new_un;
int rval;
int nretries = 0;
mnum = MD_SID(un);
setno = MD_UN2SET(un);
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
if (shared->ms_state != CS_ERRED)
return (0);
/* Don't start a new component resync if a resync is already running. */
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
return (0);
if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
uint_t msgflags;
md_mn_msgtype_t msgtype;
/* Send allocate hotspare message to all nodes */
allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
allochspmsg.msg_allochsp_sm = smi;
allochspmsg.msg_allochsp_comp = ci;
allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
/*
* Before calling mdmn_ksend_message(), release the locks.
* We can never be in the context of an ioctl here.
*/
md_unit_writerexit(MDI_UNIT(mnum));
if (flags & MD_HOTSPARE_LINKHELD)
rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
if (mirror_debug_flag)
printf("send alloc hotspare, flags="
"0x%x %x, %x, %x, %x\n", flags,
allochspmsg.msg_allochsp_mnum,
allochspmsg.msg_allochsp_sm,
allochspmsg.msg_allochsp_comp,
allochspmsg.msg_allochsp_hs_id);
#endif
if (flags & MD_HOTSPARE_WMUPDATE) {
msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2;
/*
* When coming from an update of watermarks, there
* must already be a message logged that triggered
* this action. So, no need to log this message, too.
*/
msgflags = MD_MSGF_NO_LOG;
} else {
msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE;
msgflags = MD_MSGF_DEFAULT_FLAGS;
}
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
cc4hs_msg:
rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
(char *)&allochspmsg, sizeof (allochspmsg),
kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
if (mirror_debug_flag)
mdmn_ksend_show_error(rval, kresult,
"ALLOCATE HOTSPARE");
#endif
/*
* If the message is sent ok but the exitval indicates an
* error, it must be because the mirror has been cleared. In
* this case re-obtain the lock and return an error.
*/
if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
if (flags & MD_HOTSPARE_LINKHELD) {
rw_enter(&mirror_md_ops.md_link_rw.lock,
RW_READER);
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
return (1);
}
/* If we're shutting down already, pause things here. */
if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd has become reachable again, so retry
* once. If this fails we'll panic as the
* system is in an unexpected state.
*/
if (nretries++ == 0)
goto cc4hs_msg;
}
cmn_err(CE_PANIC,
"ksend_message failure: ALLOCATE_HOTSPARE");
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
/*
* re-obtain the locks
*/
if (flags & MD_HOTSPARE_LINKHELD)
rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
new_un = md_unit_writerlock(MDI_UNIT(mnum));
/*
* As we had to release the locks in order to send the
* message to all nodes, we need to check to see if the
* unit has changed. If it has we release the writerlock
* and return fail.
*/
if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
md_unit_writerexit(MDI_UNIT(mnum));
return (1);
}
} else {
if (MD_MNSET_SETNO(setno)) {
/*
* If 2 or more nodes simultaneously see a
* component failure, these nodes will each
* send an ALLOCATE_HOTSPARE[2] message.
* The first message will allocate the hotspare
* and the subsequent messages should do nothing.
*
* If a slave node doesn't have a hotspare allocated
* at the time the message is initiated, then the
* passed in hs_id will be 0. If the node
* executing this routine has a component shared
* ms_hs_id of non-zero, but the message shows a
* hs_id of 0, then just return since a hotspare
* has already been allocated for this failing
* component. When the slave node returns from
* the ksend_message the hotspare will have
* already been allocated.
*
* If the slave node does send an hs_id of non-zero,
* and the slave node's hs_id matches this node's
* ms_hs_id, then the hotspare has errored and
* should be replaced.
*
* If the slave node sends an hs_id of non-zero and
* this node has a different shared ms_hs_id, then
* just return since this hotspare has already
* been hotspared.
*/
if (shared->ms_hs_id != 0) {
if (hs_id == 0) {
#ifdef DEBUG
if (mirror_debug_flag) {
printf("check_comp_4_hotspares"
"(NOXMIT), short circuit "
"hs_id=0x%x, "
"ms_hs_id=0x%x\n",
hs_id, shared->ms_hs_id);
}
#endif
return (0);
}
if (hs_id != shared->ms_hs_id) {
#ifdef DEBUG
if (mirror_debug_flag) {
printf("check_comp_4_hotspares"
"(NOXMIT), short circuit2 "
"hs_id=0x%x, "
"ms_hs_id=0x%x\n",
hs_id, shared->ms_hs_id);
}
#endif
return (0);
}
}
}
sm = &un->un_sm[smi];
hs_dev = md_get_named_service(sm->sm_dev, 0,
"hotspare device", 0);
if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
&hs_data) != 0)
return (0);
/*
* set_sm_comp_state() commits the modified records.
* As we don't transmit the changes, no need to drop the lock.
*/
set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
MD_STATE_NO_XMIT, (IOLOCK *)NULL);
(*hs_done)(sm->sm_dev, hs_data);
mirror_check_failfast(mnum);
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
setno, MD_SID(un));
/*
* For a multi-node set we need to reset the un_rs_type,
* un_rs_resync_done and un_rs_resync_2_do fields as the
* hot-spare resync must copy all applicable data.
*/
if (MD_MNSET_SETNO(setno)) {
un->un_rs_type = MD_RS_NONE;
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
}
/*
* Must drop writer lock since mirror_resync_unit will
* open devices and must be able to grab readerlock.
* Don't need to drop IOLOCK since any descendent routines
* calling ksend_messages will drop the IOLOCK as needed.
*
*/
if (lockp) {
md_ioctl_writerexit(lockp);
} else {
md_unit_writerexit(MDI_UNIT(mnum));
}
/* start resync */
(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
if (lockp) {
new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
} else {
new_un = md_unit_writerlock(MDI_UNIT(mnum));
}
}
return (0);
}
/*
* check_unit_4_hotspares
*
* For a given mirror, allocate hotspares, if available, for any
* components that are in error.
*
* Returns 0 if ok
* 1 if check_comp_4_hotspares returns non-zero. This will only
* happen for a MN unit where the unit has been cleared while
* the allocate hotspare message is sent to all nodes.
*/
static int
check_unit_4_hotspares(mm_unit_t *un, int flags)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int ci;
int i;
int compcnt;
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
return (0);
for (i = 0; i < NMIRROR; i++) {
sm = &un->un_sm[i];
smic = &un->un_smic[i];
if (!SMS_IS(sm, SMS_INUSE))
continue;
compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
for (ci = 0; ci < compcnt; ci++) {
md_m_shared_t *shared;
shared = (md_m_shared_t *)
(*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
/*
* Never called from ioctl context, so pass in
* (IOLOCK *)NULL. Pass through flags from calling
* routine, also setting XMIT flag.
*/
if (check_comp_4_hotspares(un, i, ci,
(MD_HOTSPARE_XMIT | flags),
shared->ms_hs_id, (IOLOCK *)NULL) != 0)
return (1);
}
}
return (0);
}
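/*
 * check_4_hotspares: daemon context handler that scans every mirror in every
 * diskset and attempts to allocate hotspares for components in error. For a
 * MN set, only the master performs the allocation.
 */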
static void
check_4_hotspares(daemon_request_t *drq)
{
mdi_unit_t *ui;
mm_unit_t *un;
md_link_t *next;
int x;
mutex_enter(&drq->dr_mx); /* clear up front so can poke */
drq->dr_pending = 0; /* again in low level routine if */
mutex_exit(&drq->dr_mx); /* something found to do */
/*
* Used to have a problem here. The disksets weren't marked as being
* MNHOLD. This opened a window where we could be searching for
* hotspares and have the disk set unloaded (released) from under
* us causing a panic in stripe_component_count().
* The way to prevent that is to mark the set MNHOLD which prevents
* any diskset from being released while we are scanning the mirrors,
* submirrors and components.
*/
for (x = 0; x < md_nsets; x++)
md_holdset_enter(x);
rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
ui = MDI_UNIT(next->ln_id);
un = (mm_unit_t *)md_unit_readerlock(ui);
/*
* Only check the unit if we are the master for this set.
* For an MN set, poke_hotspares() is only effective on the
* master.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
md_set[MD_UN2SET(un)].s_am_i_master == 0) {
md_unit_readerexit(ui);
continue;
}
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
md_unit_readerexit(ui);
continue;
}
md_unit_readerexit(ui);
un = (mm_unit_t *)md_unit_writerlock(ui);
/*
* check_unit_4_hotspares will return 1 if the unit has been
* removed during the process of allocating the hotspare.
* This can only happen for a MN metadevice. If unit no longer
* exists, no need to release writerlock
*/
if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
md_unit_writerexit(ui);
else {
/*
* If check_unit_4_hotspares failed, queue another
* request and break out of this one
*/
(void) poke_hotspares();
break;
}
}
rw_exit(&mirror_md_ops.md_link_rw.lock);
for (x = 0; x < md_nsets; x++)
md_holdset_exit(x);
}
/*
* poke_hotspares
*
* If there is not already a poke_hotspares request pending, queue a request
* to call check_4_hotspares(). This will scan all mirrors and attempt to
* allocate hotspares for all components in error.
*/
int
poke_hotspares()
{
mutex_enter(&hotspare_request.dr_mx);
if (hotspare_request.dr_pending == 0) {
hotspare_request.dr_pending = 1;
daemon_request(&md_mhs_daemon,
check_4_hotspares, (daemon_queue_t *)&hotspare_request,
REQ_OLD);
}
mutex_exit(&hotspare_request.dr_mx);
return (0);
}
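/* Free a list of errored-component (err_comp_t) entries. */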
static void
free_all_ecomps(err_comp_t *ecomp)
{
err_comp_t *d;
while (ecomp != NULL) {
d = ecomp;
ecomp = ecomp->ec_next;
kmem_free(d, sizeof (err_comp_t));
}
}
/*
* NAME: mirror_openfail_console_info
*
* DESCRIPTION: Prints an informative message to the console when a mirror
* cannot be opened.
*
* PARAMETERS: mm_unit_t *un - pointer to mirror unit structure
* int smi - submirror index
* int ci - component index
*/
void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
void (*get_dev)();
ms_cd_info_t cd;
md_dev64_t tmpdev;
tmpdev = un->un_sm[smi].sm_dev;
get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
if (get_dev != NULL) {
(void) (*get_dev)(tmpdev, smi, ci, &cd);
cmn_err(CE_WARN, "md %s: open error on %s",
md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
cd.cd_dev, NULL, 0));
} else {
cmn_err(CE_WARN, "md %s: open error",
md_shortname(MD_SID(un)));
}
}
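/* Close all in-use submirror devices of the given mirror. Always returns 0. */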
static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
int i;
md_dev64_t dev;
for (i = 0; i < NMIRROR; i++) {
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
dev = un->un_sm[i].sm_dev;
md_layered_close(dev, md_cflags);
}
return (0);
}
/*
* Keep track of drivers that don't support failfast. We use this so that
* we only log one diagnostic message for each of these drivers, no matter
* how many times we run the mirror_check_failfast function.
* Return 1 if this is a new driver that does not support failfast,
* return 0 if we have already seen this non-failfast driver.
*/
static int
new_non_ff_driver(const char *s)
{
mutex_enter(&non_ff_drv_mutex);
if (non_ff_drivers == NULL) {
non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
KM_NOSLEEP);
if (non_ff_drivers == NULL) {
mutex_exit(&non_ff_drv_mutex);
return (1);
}
non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
KM_NOSLEEP);
if (non_ff_drivers[0] == NULL) {
kmem_free(non_ff_drivers, 2 * sizeof (char *));
non_ff_drivers = NULL;
mutex_exit(&non_ff_drv_mutex);
return (1);
}
(void) strcpy(non_ff_drivers[0], s);
non_ff_drivers[1] = NULL;
} else {
int i;
char **tnames;
char **tmp;
for (i = 0; non_ff_drivers[i] != NULL; i++) {
if (strcmp(s, non_ff_drivers[i]) == 0) {
mutex_exit(&non_ff_drv_mutex);
return (0);
}
}
/* allow for new element and null */
i += 2;
tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
if (tnames == NULL) {
mutex_exit(&non_ff_drv_mutex);
return (1);
}
for (i = 0; non_ff_drivers[i] != NULL; i++)
tnames[i] = non_ff_drivers[i];
tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
if (tnames[i] == NULL) {
/* adjust i so that it is the right count to free */
kmem_free(tnames, (i + 2) * sizeof (char *));
mutex_exit(&non_ff_drv_mutex);
return (1);
}
(void) strcpy(tnames[i++], s);
tnames[i] = NULL;
tmp = non_ff_drivers;
non_ff_drivers = tnames;
/* i now represents the count we previously alloced */
kmem_free(tmp, i * sizeof (char *));
}
mutex_exit(&non_ff_drv_mutex);
return (1);
}
/*
* Check for the "ddi-failfast-supported" devtree property on each submirror
* component to indicate if we should do I/O to that submirror with the
* B_FAILFAST flag set or not. This check is made at various state transitions
* in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we
* only need to check one drive (e.g. hotspare) but since the check is
* fast and infrequent and sometimes needs to be done on all components we
* just check all components on each call.
*/
void
mirror_check_failfast(minor_t mnum)
{
int i;
mm_unit_t *un;
if (md_ff_disable)
return;
un = MD_UNIT(mnum);
for (i = 0; i < NMIRROR; i++) {
int ci;
int cnt;
int ff = 1;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
void (*get_dev)();
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
sm = &un->un_sm[i];
smic = &un->un_smic[i];
get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
"get device", 0);
cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
for (ci = 0; ci < cnt; ci++) {
int found = 0;
dev_t ci_dev;
major_t major;
dev_info_t *devi;
ms_cd_info_t cd;
/*
* this already returns the hs
* dev if the device is spared
*/
(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
ci_dev = md_dev64_to_dev(cd.cd_dev);
major = getmajor(ci_dev);
if (major == md_major) {
/*
* this component must be a soft
* partition; get the real dev
*/
minor_t dev_mnum;
mdi_unit_t *ui;
mp_unit_t *un;
set_t setno;
side_t side;
md_dev64_t tmpdev;
ui = MDI_UNIT(getminor(ci_dev));
/* grab necessary lock */
un = (mp_unit_t *)md_unit_readerlock(ui);
dev_mnum = MD_SID(un);
setno = MD_MIN2SET(dev_mnum);
side = mddb_getsidenum(setno);
tmpdev = un->un_dev;
/* Get dev by device id */
if (md_devid_found(setno, side,
un->un_key) == 1) {
tmpdev = md_resolve_bydevid(dev_mnum,
tmpdev, un->un_key);
}
md_unit_readerexit(ui);
ci_dev = md_dev64_to_dev(tmpdev);
major = getmajor(ci_dev);
}
if (ci_dev != NODEV32 &&
(devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
!= NULL) {
ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF;
int propvalue = 0;
int proplength = sizeof (int);
int error;
struct cb_ops *cb;
if ((cb = devopsp[major]->devo_cb_ops) !=
NULL) {
error = (*cb->cb_prop_op)
(DDI_DEV_T_ANY, devi, prop_op,
DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
"ddi-failfast-supported",
(caddr_t)&propvalue, &proplength);
if (error == DDI_PROP_SUCCESS)
found = 1;
}
if (!found && new_non_ff_driver(
ddi_driver_name(devi))) {
cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
"disabled on %s",
ddi_driver_name(devi));
}
ddi_release_devi(devi);
}
/*
* All components must support
* failfast in the submirror.
*/
if (!found) {
ff = 0;
break;
}
}
if (ff) {
sm->sm_flags |= MD_SM_FAILFAST;
} else {
sm->sm_flags &= ~MD_SM_FAILFAST;
}
}
}
/*
* Return true if the submirror is unavailable.
* If any of the submirror components are opened then the submirror cannot
* be unavailable (MD_INACCESSIBLE).
* If any of the components are already in the errored state, then the submirror
* cannot be unavailable (MD_INACCESSIBLE).
*/
static bool_t
submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
int ci;
int compcnt;
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
for (ci = 0; ci < compcnt; ci++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
if (from_probe) {
if (shared->ms_flags & MDM_S_PROBEOPEN)
return (B_FALSE);
} else {
if (shared->ms_flags & MDM_S_ISOPEN)
return (B_FALSE);
}
if (shared->ms_state == CS_ERRED ||
shared->ms_state == CS_LAST_ERRED)
return (B_FALSE);
}
return (B_TRUE);
}
static int
mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
{
int i;
mm_unit_t *un;
mdi_unit_t *ui;
int err;
int smi;
int ci;
err_comp_t *c;
err_comp_t *ecomps = NULL;
int smmask = 0;
set_t setno;
int sm_cnt;
int sm_unavail_cnt;
mirror_check_failfast(mnum);
un = MD_UNIT(mnum);
ui = MDI_UNIT(mnum);
setno = MD_UN2SET(un);
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev = un->un_sm[i].sm_dev;
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
if (md_layered_open(mnum, &tmpdev, md_oflags))
smmask |= SMI2BIT(i);
un->un_sm[i].sm_dev = tmpdev;
}
/*
* If smmask is clear, all submirrors are accessible. Clear the
* MD_INACCESSIBLE bit in this case. This bit is also cleared for the
* mirror device. If smmask is set, we have to determine which of the
* submirrors are in error. If no submirror is accessible we mark the
* whole mirror as MD_INACCESSIBLE.
*/
if (smmask == 0) {
if (lockp) {
md_ioctl_readerexit(lockp);
(void) md_ioctl_writerlock(lockp, ui);
} else {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
}
ui->ui_tstate &= ~MD_INACCESSIBLE;
if (lockp) {
md_ioctl_writerexit(lockp);
(void) md_ioctl_readerlock(lockp, ui);
} else {
md_unit_writerexit(ui);
(void) md_unit_readerlock(ui);
}
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev;
mdi_unit_t *sm_ui;
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
tmpdev = un->un_sm[i].sm_dev;
sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
(void) md_unit_writerlock(sm_ui);
sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
md_unit_writerexit(sm_ui);
}
return (0);
}
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev;
if (!(smmask & SMI2BIT(i)))
continue;
tmpdev = un->un_sm[i].sm_dev;
err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
un->un_sm[i].sm_dev = tmpdev;
ASSERT(err == 0);
}
if (lockp) {
md_ioctl_readerexit(lockp);
un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
} else {
md_unit_readerexit(ui);
un = (mm_unit_t *)md_unit_writerlock(ui);
}
/*
* We want to make sure the unavailable flag is not masking a real
* error on the submirror.
* For each submirror:
* if all of the submirror components couldn't be opened and there
* are no errors on the submirror, then set the unavailable flag;
* otherwise, clear it.
*/
sm_cnt = 0;
sm_unavail_cnt = 0;
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev;
mdi_unit_t *sm_ui;
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
sm_cnt++;
tmpdev = un->un_sm[i].sm_dev;
sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
(void) md_unit_writerlock(sm_ui);
if (submirror_unavailable(un, i, 0)) {
sm_ui->ui_tstate |= MD_INACCESSIBLE;
sm_unavail_cnt++;
} else {
sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
}
md_unit_writerexit(sm_ui);
}
/*
* If all of the submirrors are unavailable, the mirror is also
* unavailable.
*/
if (sm_cnt == sm_unavail_cnt) {
ui->ui_tstate |= MD_INACCESSIBLE;
} else {
ui->ui_tstate &= ~MD_INACCESSIBLE;
}
smi = 0;
ci = 0;
while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
if (mirror_other_sources(un, smi, ci, 1) == 1) {
free_all_ecomps(ecomps);
(void) mirror_close_all_devs(un, md_oflags);
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
SVM_TAG_METADEVICE, setno, MD_SID(un));
mirror_openfail_console_info(un, smi, ci);
if (lockp) {
md_ioctl_writerexit(lockp);
(void) md_ioctl_readerlock(lockp, ui);
} else {
md_unit_writerexit(ui);
(void) md_unit_readerlock(ui);
}
return (ENXIO);
}
/* track all component states that need changing */
c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
c->ec_next = ecomps;
c->ec_smi = smi;
c->ec_ci = ci;
ecomps = c;
ci++;
}
/* Make all state changes and commit them */
for (c = ecomps; c != NULL; c = c->ec_next) {
/*
* If lockp is set, then entering kernel through ioctl.
* For a MN set, the only ioctl path is via a commd message
* (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
* being sent to each node.
* In this case, set NO_XMIT so that set_sm_comp_state
* won't attempt to send a message from within a message handler.
*
* In !MN sets, the xmit flag is ignored, so it doesn't matter
* which flag is passed.
*/
if (lockp) {
set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
MD_STATE_NO_XMIT, lockp);
} else {
set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
(MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
}
/*
* For a MN set, the NOTIFY is done when the state change is
* processed on each node
*/
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
}
if (lockp) {
md_ioctl_writerexit(lockp);
(void) md_ioctl_readerlock(lockp, ui);
} else {
md_unit_writerexit(ui);
(void) md_unit_readerlock(ui);
}
free_all_ecomps(ecomps);
/* allocate hotspares for all errored components */
if (MD_MNSET_SETNO(setno)) {
/*
* If we're called from an ioctl (lockp set) then we cannot
* directly call send_poke_hotspares as this will block until
* the message gets despatched to all nodes. If the cluster is
* going through a reconfig cycle then the message will block
* until the cycle is complete, and as we originate from a
* service call from commd we will livelock.
*/
if (lockp == NULL) {
md_unit_readerexit(ui);
send_poke_hotspares(setno);
(void) md_unit_readerlock(ui);
}
} else {
(void) poke_hotspares();
}
return (0);
}
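/*
 * mirror_overlap_tree_remove: remove a parent structure from the overlap
 * tree and wake up any threads blocked in wait_for_overlaps().
 */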
void
mirror_overlap_tree_remove(md_mps_t *ps)
{
mm_unit_t *un;
if (panicstr)
return;
VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
un = ps->ps_un;
mutex_enter(&un->un_overlap_tree_mx);
avl_remove(&un->un_overlap_root, ps);
ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
if (un->un_overlap_tree_flag != 0) {
un->un_overlap_tree_flag = 0;
cv_broadcast(&un->un_overlap_tree_cv);
}
mutex_exit(&un->un_overlap_tree_mx);
}
/*
* wait_for_overlaps:
* -----------------
* Check that the given i/o request does not cause an overlap with already
* pending i/o. If it does, block until the overlapped i/o completes.
*
* The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
* structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
* it must not already be in the tree.
*/
static void
wait_for_overlaps(md_mps_t *ps, int flags)
{
mm_unit_t *un;
avl_index_t where;
md_mps_t *ps1;
if (panicstr)
return;
un = ps->ps_un;
mutex_enter(&un->un_overlap_tree_mx);
if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
(ps->ps_flags & MD_MPS_ON_OVERLAP)) {
mutex_exit(&un->un_overlap_tree_mx);
return;
}
VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
do {
ps1 = avl_find(&un->un_overlap_root, ps, &where);
if (ps1 == NULL) {
/*
* The candidate range does not overlap with any
* range in the tree. Insert it and be done.
*/
avl_insert(&un->un_overlap_root, ps, where);
ps->ps_flags |= MD_MPS_ON_OVERLAP;
} else {
/*
* The candidate range would overlap. Set the flag
* indicating we need to be woken up, and sleep
* until another thread removes a range. If upon
* waking up we find this mps was put on the tree
* by another thread, the loop terminates.
*/
un->un_overlap_tree_flag = 1;
cv_wait(&un->un_overlap_tree_cv,
&un->un_overlap_tree_mx);
}
} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
mutex_exit(&un->un_overlap_tree_mx);
}
/*
* This function is called from mirror_done to check whether any pages have
* been modified while a mirrored write was in progress. Returns 0 if
* all pages associated with bp are clean, 1 otherwise.
*/
static int
any_pages_dirty(struct buf *bp)
{
int rval;
rval = biomodified(bp);
if (rval == -1)
rval = 0;
return (rval);
}
#define MAX_EXTRAS 10
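/*
 * mirror_commit: commit the mirror unit record, the unit records of the
 * submirrors selected by smmask, and any extra record ids passed in
 * 'extras' to the metadevice database. No-op if the set is stale.
 */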
void
mirror_commit(
mm_unit_t *un,
int smmask,
mddb_recid_t *extras
)
{
mm_submirror_t *sm;
md_unit_t *su;
int i;
/* 2=mirror,null id */
mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS];
int ri = 0;
if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
return;
/* Add two, this includes the mirror unit and the null recid */
if (extras != NULL) {
int nrecids = 0;
while (extras[nrecids] != 0) {
nrecids++;
}
ASSERT(nrecids <= MAX_EXTRAS);
}
if (un != NULL)
recids[ri++] = un->c.un_record_id;
for (i = 0; i < NMIRROR; i++) {
if (!(smmask & SMI2BIT(i)))
continue;
sm = &un->un_sm[i];
if (!SMS_IS(sm, SMS_INUSE))
continue;
if (md_getmajor(sm->sm_dev) != md_major)
continue;
su = MD_UNIT(md_getminor(sm->sm_dev));
recids[ri++] = su->c.un_record_id;
}
if (extras != NULL)
while (*extras != 0) {
recids[ri++] = *extras;
extras++;
}
if (ri == 0)
return;
recids[ri] = 0;
/*
* Ok to hold ioctl lock across record commit to mddb as
* long as the record(s) being committed aren't resync records.
*/
mddb_commitrecs_wrapper(recids);
}
/*
* This routine sets a bit in the writable_bm bitmap for each writable
* submirror of the metamirror. The bitmap and the number of writable
* submirrors are recorded in the parent save structure (ps_writable_sm
* and ps_active_cnt) for use when starting the writes.
*/
static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{
int i;
unsigned writable_bm = 0;
unsigned nunits = 0;
for (i = 0; i < NMIRROR; i++) {
if (SUBMIRROR_IS_WRITEABLE(un, i)) {
/* set bit of all writable units */
writable_bm |= SMI2BIT(i);
nunits++;
}
}
ps->ps_writable_sm = writable_bm;
ps->ps_active_cnt = nunits;
ps->ps_current_sm = 0;
}
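/*
 * select_write_after_read_units: as select_write_units(), but only
 * submirrors that are resync targets are selected and the submirror the
 * data was read from (ps_allfrom_sm) is excluded. Returns the number of
 * submirrors selected.
 */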
static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{
int i;
unsigned writable_bm = 0;
unsigned nunits = 0;
for (i = 0; i < NMIRROR; i++) {
if (SUBMIRROR_IS_WRITEABLE(un, i) &&
un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
writable_bm |= SMI2BIT(i);
nunits++;
}
}
if ((writable_bm & ps->ps_allfrom_sm) != 0) {
writable_bm &= ~ps->ps_allfrom_sm;
nunits--;
}
ps->ps_writable_sm = writable_bm;
ps->ps_active_cnt = nunits;
ps->ps_current_sm = 0;
return (nunits);
}
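/*
 * select_read_unit: choose the submirror to read from. The first readable,
 * accessible submirror whose component covering blkno is in the Okay state
 * is used, with *cando set to the number of blocks readable from it. If no
 * Okay component exists, the Last Erred component with the highest error
 * count is chosen and *cando is set to the smallest count found.
 */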
static md_dev64_t
select_read_unit(
mm_unit_t *un,
diskaddr_t blkno,
u_longlong_t reqcount,
u_longlong_t *cando,
int must_be_opened,
md_m_shared_t **shared,
md_mcs_t *cs)
{
int i;
md_m_shared_t *s;
uint_t lasterrcnt = 0;
md_dev64_t dev = 0;
u_longlong_t cnt;
u_longlong_t mincnt;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
mdi_unit_t *ui;
mincnt = reqcount;
for (i = 0; i < NMIRROR; i++) {
if (!SUBMIRROR_IS_READABLE(un, i))
continue;
sm = &un->un_sm[i];
smic = &un->un_smic[i];
cnt = reqcount;
/*
* If the current submirror is marked as inaccessible, do not
* try to access it.
*/
ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
(void) md_unit_readerlock(ui);
if (ui->ui_tstate & MD_INACCESSIBLE) {
md_unit_readerexit(ui);
continue;
}
md_unit_readerexit(ui);
s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
(sm->sm_dev, sm, blkno, &cnt);
if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
continue;
if (s->ms_state == CS_OKAY) {
*cando = cnt;
if (shared != NULL)
*shared = s;
if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
cs != NULL) {
cs->cs_buf.b_flags |= B_FAILFAST;
}
return (un->un_sm[i].sm_dev);
}
if (s->ms_state != CS_LAST_ERRED)
continue;
/* don't use B_FAILFAST since we're Last Erred */
if (mincnt > cnt)
mincnt = cnt;
if (s->ms_lasterrcnt > lasterrcnt) {
lasterrcnt = s->ms_lasterrcnt;
if (shared != NULL)
*shared = s;
dev = un->un_sm[i].sm_dev;
}
}
*cando = mincnt;
return (dev);
}
/*
* Given a 32-bit bitmap, this routine will return the bit number
* of the nth bit set. The nth bit set is passed via the index integer.
*
* This routine is used to run through the writable submirror bitmap
* when starting all of the writes. The value returned is the index of
* the appropriate submirror structure in the un_sm array of the
* metamirror.
*/
static int
md_find_nth_unit(uint_t mask, int index)
{
int bit, nfound;
for (bit = -1, nfound = -1; nfound != index; bit++) {
ASSERT(mask != 0);
nfound += (mask & 1);
mask >>= 1;
}
return (bit);
}
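/*
 * fast_select_read_unit: select the source submirror for a read using only
 * submirror-level (SMS_RUNNING) state, set up the child buf to read from it
 * and return 0. Returns 1 if no submirror is in the Running state.
 */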
static int
fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
{
mm_unit_t *un;
buf_t *bp;
int i;
unsigned nunits = 0;
int iunit;
uint_t running_bm = 0;
uint_t sm_index;
bp = &cs->cs_buf;
un = ps->ps_un;
for (i = 0; i < NMIRROR; i++) {
if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
continue;
running_bm |= SMI2BIT(i);
nunits++;
}
if (nunits == 0)
return (1);
/*
* For directed mirror read (DMR) we only use the specified side and
* do not compute the source of the read.
* If we're running with MD_MPS_DIRTY_RD set we always return the
* first mirror side (this prevents unnecessary ownership switching).
* Otherwise we return the submirror according to the mirror read option.
*/
if (ps->ps_flags & MD_MPS_DMR) {
sm_index = un->un_dmr_last_read;
} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
sm_index = md_find_nth_unit(running_bm, 0);
} else {
/* Normal (non-DMR) operation */
switch (un->un_read_option) {
case RD_GEOMETRY:
iunit = (int)(bp->b_lblkno /
howmany(un->c.un_total_blocks, nunits));
sm_index = md_find_nth_unit(running_bm, iunit);
break;
case RD_FIRST:
sm_index = md_find_nth_unit(running_bm, 0);
break;
case RD_LOAD_BAL:
/* intentionally falls through to the default case */
default:
un->un_last_read = (un->un_last_read + 1) % nunits;
sm_index = md_find_nth_unit(running_bm,
un->un_last_read);
break;
}
}
bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
ps->ps_allfrom_sm = SMI2BIT(sm_index);
if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
bp->b_flags |= B_FAILFAST;
}
return (0);
}
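/*
 * mirror_are_submirrors_available: return 1 if every in-use submirror that
 * is itself a metadevice has a valid in-core unit structure, 0 otherwise.
 */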
static
int
mirror_are_submirrors_available(mm_unit_t *un)
{
int i;
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev = un->un_sm[i].sm_dev;
if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
md_getmajor(tmpdev) != md_major)
continue;
if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
(MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
return (0);
if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
return (0);
}
return (1);
}
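/*
 * build_submirror: initialize the in-core submirror and its named-service
 * vectors. When snarfing, the submirror device number is re-resolved from
 * the namespace key.
 */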
void
build_submirror(mm_unit_t *un, int i, int snarfing)
{
struct mm_submirror *sm;
struct mm_submirror_ic *smic;
md_unit_t *su;
set_t setno;
sm = &un->un_sm[i];
smic = &un->un_smic[i];
sm->sm_flags = 0; /* sometime we may need to do more here */
setno = MD_UN2SET(un);
if (!SMS_IS(sm, SMS_INUSE))
return;
if (snarfing) {
sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
sm->sm_key, MD_NOTRUST_DEVT);
} else {
if (md_getmajor(sm->sm_dev) == md_major) {
su = MD_UNIT(md_getminor(sm->sm_dev));
un->c.un_flag |= (su->c.un_flag & MD_LABELED);
/* submirror can no longer be soft partitioned */
MD_CAPAB(su) &= (~MD_CAN_SP);
}
}
smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
0, "shared by blk", 0);
smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
0, "shared by indx", 0);
smic->sm_get_component_count = (int (*)())md_get_named_service(
sm->sm_dev, 0, "get component count", 0);
smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
"get block count skip size", 0);
sm->sm_state &= ~SMS_IGNORE;
if (SMS_IS(sm, SMS_OFFLINE))
MD_STATUS(un) |= MD_UN_OFFLINE_SM;
md_set_parent(sm->sm_dev, MD_SID(un));
}
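/*
 * mirror_cleanup: delete the mddb records and namespace entries associated
 * with a mirror unit. On a MN diskset, only the master deletes records on
 * snarf of the mirror records.
 */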
static void
mirror_cleanup(mm_unit_t *un)
{
mddb_recid_t recid;
int smi;
sv_dev_t sv[NMIRROR];
int nsv = 0;
/*
* If a MN diskset and this node is not the master, do
* not delete any records on snarf of the mirror records.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
md_set[MD_UN2SET(un)].s_am_i_master == 0) {
return;
}
for (smi = 0; smi < NMIRROR; smi++) {
if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
continue;
sv[nsv].setno = MD_UN2SET(un);
sv[nsv++].key = un->un_sm[smi].sm_key;
}
recid = un->un_rr_dirty_recid;
mddb_deleterec_wrapper(un->c.un_record_id);
if (recid > 0)
mddb_deleterec_wrapper(recid);
md_rem_names(sv, nsv);
}
/*
* Comparison function for the avl tree which tracks
* outstanding writes on submirrors.
*
* Returns:
* -1: ps1 < ps2
* 0: ps1 and ps2 overlap
* 1: ps1 > ps2
*/
static int
mirror_overlap_compare(const void *p1, const void *p2)
{
const md_mps_t *ps1 = (md_mps_t *)p1;
const md_mps_t *ps2 = (md_mps_t *)p2;
if (ps1->ps_firstblk < ps2->ps_firstblk) {
if (ps1->ps_lastblk >= ps2->ps_firstblk)
return (0);
return (-1);
}
if (ps1->ps_firstblk > ps2->ps_firstblk) {
if (ps1->ps_firstblk <= ps2->ps_lastblk)
return (0);
return (1);
}
return (0);
}
/*
* Collapse any sparse submirror entries snarfed from the on-disk replica.
* Only the in-core entries are updated. The replica will be updated on-disk
* when the in-core replica is committed on shutdown of the SVM subsystem.
*/
static void
collapse_submirrors(mm_unit_t *un)
{
int smi, nremovals, smiremove;
mm_submirror_t *sm, *new_sm, *old_sm;
mm_submirror_ic_t *smic;
int nsmidx = un->un_nsm - 1;
rescan:
nremovals = 0;
smiremove = -1;
for (smi = 0; smi <= nsmidx; smi++) {
sm = &un->un_sm[smi];
/*
* Check to see if this submirror is marked as in-use.
* If it isn't then it is a potential sparse entry and
* may need to be cleared from the configuration.
* The records should _already_ have been cleared by the
* original mirror_detach() code, but we need to shuffle
* any NULL entries in un_sm[] to the end of the array.
* Any NULL un_smic[] entries need to be reset to the underlying
* submirror/slice accessor functions.
*/
if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
nremovals++;
smiremove = smi;
break;
}
}
if (nremovals == 0) {
/*
* Ensure that we have a matching contiguous set of un_smic[]
* entries for the corresponding un_sm[] entries
*/
for (smi = 0; smi <= nsmidx; smi++) {
smic = &un->un_smic[smi];
sm = &un->un_sm[smi];
smic->sm_shared_by_blk =
md_get_named_service(sm->sm_dev, 0,
"shared by_blk", 0);
smic->sm_shared_by_indx =
md_get_named_service(sm->sm_dev, 0,
"shared by indx", 0);
smic->sm_get_component_count =
(int (*)())md_get_named_service(sm->sm_dev, 0,
"get component count", 0);
smic->sm_get_bcss =
(int (*)())md_get_named_service(sm->sm_dev, 0,
"get block count skip size", 0);
}
return;
}
/*
* Reshuffle the submirror devices so that we do not have a dead record
* in the middle of the array. Once we've done this we need to rescan
* the mirror to check for any other holes.
*/
for (smi = 0; smi < NMIRROR; smi++) {
if (smi < smiremove)
continue;
if (smi > smiremove) {
old_sm = &un->un_sm[smi];
new_sm = &un->un_sm[smi - 1];
bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
bzero(old_sm, sizeof (mm_submirror_t));
}
}
/*
* Now we need to rescan the array to find the next potential dead
* entry.
*/
goto rescan;
}
/* Return -1 if the optimized record is unavailable and the set should be released */
int
mirror_build_incore(mm_unit_t *un, int snarfing)
{
int i;
if (MD_STATUS(un) & MD_UN_BEING_RESET) {
mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
return (1);
}
if (mirror_are_submirrors_available(un) == 0)
return (1);
if (MD_UNIT(MD_SID(un)) != NULL)
return (0);
MD_STATUS(un) = 0;
/* pre-4.1 didn't define CAN_META_CHILD capability */
MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
un->un_overlap_tree_flag = 0;
avl_create(&un->un_overlap_root, mirror_overlap_compare,
sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
/*
* We need to collapse any sparse submirror entries into a non-sparse
* array. This is to cover the case where we have an old replica image
* which has not been updated (i.e. snarfed) since being modified.
* The new code expects all submirror access to be sequential (i.e.
* both the un_sm[] and un_smic[] entries correspond to non-empty
* submirrors).
*/
collapse_submirrors(un);
for (i = 0; i < NMIRROR; i++)
build_submirror(un, i, snarfing);
if (unit_setup_resync(un, snarfing) != 0) {
if (snarfing) {
mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
/*
* If a MN set and set is not stale, then return -1
* which will force the caller to unload the set.
* The MN diskset nodes will return failure if
* unit_setup_resync fails so that nodes won't
* get out of sync.
*
* If set is STALE, the master node can't allocate
* a resync record (if needed), but node needs to
* join the set so that user can delete broken mddbs.
* So, if set is STALE, just continue on.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
!(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
return (-1);
}
} else
return (1);
}
mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
un->un_suspend_wr_flag = 0;
mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
/*
* Allocate the mutex for mirror-owner and resync-owner changes.
* All references to the owner message state field must be guarded
* by this mutex.
*/
mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
/*
* Allocate mutex and condvar for resync thread manipulation. These
* will be used by mirror_resync_unit/mirror_ioctl_resync
*/
mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
/*
* Allocate mutex and condvar for resync progress thread manipulation.
* This allows resyncs to be continued across an intervening reboot.
*/
mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
/*
* Allocate mutex and condvar for Directed Mirror Reads (DMR). This
* provides synchronization between a user-ioctl and the resulting
* strategy() call that performs the read().
*/
mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
/*
* Allocate rwlocks for accessing un_pernode_dirty_bm.
*/
for (i = 0; i < MD_MNMAXSIDES; i++) {
rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
}
/* place various information in the in-core data structures */
md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
MD_UNIT(MD_SID(un)) = un;
return (0);
}
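/*
 * reset_mirror: tear down the in-core state of a mirror unit. If 'removing'
 * is set, the unit's mddb records, namespace entries and vtoc record are
 * also deleted and a delete sysevent is generated.
 */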
void
reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
{
mddb_recid_t recid, vtoc_id;
size_t bitcnt;
size_t shortcnt;
int smi;
sv_dev_t sv[NMIRROR];
int nsv = 0;
uint_t bits = 0;
minor_t selfid;
md_unit_t *su;
int i;
md_destroy_unit_incore(mnum, &mirror_md_ops);
shortcnt = un->un_rrd_num * sizeof (short);
bitcnt = howmany(un->un_rrd_num, NBBY);
if (un->un_outstanding_writes)
kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
if (un->un_goingclean_bm)
kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
if (un->un_goingdirty_bm)
kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
if (un->un_resync_bm)
kmem_free((caddr_t)un->un_resync_bm, bitcnt);
if (un->un_pernode_dirty_sum)
kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
/*
* Destroy the taskq for deferred processing of DRL clean requests.
* This taskq will only be present for Multi Owner mirrors.
*/
if (un->un_drl_task != NULL)
ddi_taskq_destroy(un->un_drl_task);
md_nblocks_set(mnum, -1ULL);
MD_UNIT(mnum) = NULL;
/*
* Attempt release of its minor node
*/
md_remove_minor_node(mnum);
if (!removing)
return;
for (smi = 0; smi < NMIRROR; smi++) {
if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
continue;
/* reallow soft partitioning of submirror and reset parent */
su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
MD_CAPAB(su) |= MD_CAN_SP;
md_reset_parent(un->un_sm[smi].sm_dev);
reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
sv[nsv].setno = MD_MIN2SET(mnum);
sv[nsv++].key = un->un_sm[smi].sm_key;
bits |= SMI2BIT(smi);
}
MD_STATUS(un) |= MD_UN_BEING_RESET;
recid = un->un_rr_dirty_recid;
vtoc_id = un->c.un_vtoc_id;
selfid = MD_SID(un);
mirror_commit(un, bits, 0);
avl_destroy(&un->un_overlap_root);
/* Destroy all mutexes and condvars before returning. */
mutex_destroy(&un->un_suspend_wr_mx);
cv_destroy(&un->un_suspend_wr_cv);
mutex_destroy(&un->un_overlap_tree_mx);
cv_destroy(&un->un_overlap_tree_cv);
mutex_destroy(&un->un_owner_mx);
mutex_destroy(&un->un_rs_thread_mx);
cv_destroy(&un->un_rs_thread_cv);
mutex_destroy(&un->un_rs_progress_mx);
cv_destroy(&un->un_rs_progress_cv);
mutex_destroy(&un->un_dmr_mx);
cv_destroy(&un->un_dmr_cv);
for (i = 0; i < MD_MNMAXSIDES; i++) {
rw_destroy(&un->un_pernode_dirty_mx[i]);
if (un->un_pernode_dirty_bm[i])
kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
}
/*
* Remove self from the namespace
*/
if (un->c.un_revision & MD_FN_META_DEV) {
(void) md_rem_selfname(un->c.un_self_id);
}
/* This frees the unit structure. */
mddb_deleterec_wrapper(un->c.un_record_id);
if (recid != 0)
mddb_deleterec_wrapper(recid);
/* Remove the vtoc, if present */
if (vtoc_id)
mddb_deleterec_wrapper(vtoc_id);
md_rem_names(sv, nsv);
SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
MD_MIN2SET(selfid), selfid);
}
int
mirror_internal_open(
minor_t mnum,
int flag,
int otyp,
int md_oflags,
IOLOCK *lockp /* can be NULL */
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
int err = 0;
tryagain:
/* single thread */
if (lockp) {
/*
* If ioctl lock is held, use openclose_enter
* routine that will set the ioctl flag when
* grabbing the readerlock.
*/
(void) md_ioctl_openclose_enter(lockp, ui);
} else {
(void) md_unit_openclose_enter(ui);
}
/*
* The mirror_open_all_devs routine may end up sending a STATE_UPDATE
* message in a MN diskset and this requires that the openclose
* lock is dropped in order to send this message. So, another
* flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
* attempting an open while this thread has an open in progress.
* Call the *_lh version of the lock exit routines since the ui_mx
* mutex must be held from checking for OPENINPROGRESS until
* after the cv_wait call.
*/
mutex_enter(&ui->ui_mx);
if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
if (lockp) {
(void) md_ioctl_openclose_exit_lh(lockp);
} else {
md_unit_openclose_exit_lh(ui);
}
cv_wait(&ui->ui_cv, &ui->ui_mx);
mutex_exit(&ui->ui_mx);
goto tryagain;
}
ui->ui_lock |= MD_UL_OPENINPROGRESS;
mutex_exit(&ui->ui_mx);
/* open devices, if necessary */
if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
goto out;
}
/* count open */
if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
goto out;
/* unlock, return success */
out:
mutex_enter(&ui->ui_mx);
ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
mutex_exit(&ui->ui_mx);
if (lockp) {
/*
* If ioctl lock is held, use openclose_exit
* routine that will clear the lockp reader flag.
*/
(void) md_ioctl_openclose_exit(lockp);
} else {
md_unit_openclose_exit(ui);
}
return (err);
}
int
mirror_internal_close(
minor_t mnum,
int otyp,
int md_cflags,
IOLOCK *lockp /* can be NULL */
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
mm_unit_t *un;
int err = 0;
/* single thread */
if (lockp) {
/*
* If ioctl lock is held, use openclose_enter
* routine that will set the ioctl flag when
* grabbing the readerlock.
*/
un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
} else {
un = (mm_unit_t *)md_unit_openclose_enter(ui);
}
/* count closed */
if ((err = md_unit_decopen(mnum, otyp)) != 0)
goto out;
/* close devices, if necessary */
if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
/*
* Clean up dirty bitmap for this unit. Do this
* before closing the underlying devices to avoid
* race conditions with reset_mirror() as a
* result of a 'metaset -r' command running in
* parallel. This might cause deallocation of
* dirty region bitmaps; with underlying metadevices
* in place this can't happen.
* Don't do this for a MN set with ABR set.
*/
if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
!(ui->ui_tstate & MD_ABR_CAP))
mirror_process_unit_resync(un);
}
(void) mirror_close_all_devs(un, md_cflags);
/*
* For a MN set with transient capabilities (eg ABR/DMR) set,
* clear these capabilities on the last open in the cluster.
* To do this we send a message to all nodes to see if the
* device is open.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
(ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
if (lockp) {
(void) md_ioctl_openclose_exit(lockp);
} else {
md_unit_openclose_exit(ui);
}
/*
* if we are in the context of an ioctl, drop the
* ioctl lock.
* Otherwise, no other locks should be held.
*/
if (lockp) {
IOLOCK_RETURN_RELEASE(0, lockp);
}
mdmn_clear_all_capabilities(mnum);
/* if dropped the lock previously, regain it */
if (lockp) {
IOLOCK_RETURN_REACQUIRE(lockp);
}
return (0);
}
/* unlock and return success */
}
out:
/* Call whether lockp is NULL or not. */
if (lockp) {
md_ioctl_openclose_exit(lockp);
} else {
md_unit_openclose_exit(ui);
}
return (err);
}
/*
* When a component has completed resyncing and is now ok, check if the
* corresponding component in the other submirrors is in the Last Erred
* state. If it is, we want to change that to the Erred state so we stop
* using that component and start using this good component instead.
*
* This is called from set_sm_comp_state and recursively calls
* set_sm_comp_state if it needs to change the Last Erred state.
*/
static void
reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
IOLOCK *lockp)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int ci;
int i;
int compcnt;
int changed = 0;
for (i = 0; i < NMIRROR; i++) {
sm = &un->un_sm[i];
smic = &un->un_smic[i];
if (!SMS_IS(sm, SMS_INUSE))
continue;
/* ignore the submirror that we just made ok */
if (i == smi)
continue;
compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
for (ci = 0; ci < compcnt; ci++) {
md_m_shared_t *shared;
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
if ((shared->ms_state & CS_LAST_ERRED) &&
!mirror_other_sources(un, i, ci, 1)) {
set_sm_comp_state(un, i, ci, CS_ERRED, extras,
flags, lockp);
changed = 1;
}
}
}
/* maybe there is a hotspare for this newly erred component */
if (changed) {
set_t setno;
setno = MD_UN2SET(un);
if (MD_MNSET_SETNO(setno)) {
send_poke_hotspares(setno);
} else {
(void) poke_hotspares();
}
}
}
/*
* set_sm_comp_state
*
* Set the state of a submirror component to the specified new state.
* If the mirror is in a multi-node set, send messages to all nodes to
* block all writes to the mirror and then update the state and release the
* writes. These messages are only sent if MD_STATE_XMIT is set in flags.
* MD_STATE_XMIT will be unset in 2 cases:
* 1. When the state is changed to CS_RESYNC as this state change
* will already have been updated on each node by the processing of the
* distributed metasync command, hence no need to xmit.
* 2. When the state is changed to CS_OKAY after a resync has completed. Again
* the resync completion will already have been processed on each node by
* the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
* resync, hence no need to xmit.
*
* In case we are called from the update of a watermark (MD_STATE_WMUPDATE
* will then be set in ps->flags), this is due to a metainit or similar.
* In this case the message that we send to propagate the state change must
* not be a class1 message as that would deadlock with the metainit command
* that is still being processed. We achieve this by creating a class2
* message, MD_MN_MSG_STATE_UPDATE2, instead. This also makes the submessage
* generator create a class2 submessage rather than a class1 (which would
* also block).
*
* On entry, unit_writerlock is held
* If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
* also held.
*/
void
set_sm_comp_state(
mm_unit_t *un,
int smi,
int ci,
int newstate,
mddb_recid_t *extras,
uint_t flags,
IOLOCK *lockp
)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
int origstate;
void (*get_dev)();
ms_cd_info_t cd;
char devname[MD_MAX_CTDLEN];
int err;
set_t setno = MD_UN2SET(un);
md_mn_msg_stch_t stchmsg;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
md_mn_kresult_t *kresult;
int rval;
uint_t msgflags;
md_mn_msgtype_t msgtype;
int save_lock = 0;
mdi_unit_t *ui_sm;
int nretries = 0;
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
/* If we have a real error status then turn off MD_INACCESSIBLE. */
ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
ui_sm->ui_tstate & MD_INACCESSIBLE) {
ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
}
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
origstate = shared->ms_state;
/*
* If the new state is an error and the old one wasn't, generate
* a console message. We do this before we send the state to other
* nodes in a MN set because the state change may change the component
* name if a hotspare is allocated.
*/
if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
(newstate & (CS_ERRED|CS_LAST_ERRED))) {
get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
"get device", 0);
(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
err = md_getdevname(setno, mddb_getsidenum(setno), 0,
cd.cd_dev, devname, sizeof (devname));
if (err == ENOENT) {
(void) md_devname(setno, cd.cd_dev, devname,
sizeof (devname));
}
cmn_err(CE_WARN, "md: %s: %s needs maintenance",
md_shortname(md_getminor(sm->sm_dev)), devname);
if (newstate & CS_LAST_ERRED) {
cmn_err(CE_WARN, "md: %s: %s last erred",
md_shortname(md_getminor(sm->sm_dev)),
devname);
} else if (shared->ms_flags & MDM_S_ISOPEN) {
/*
* Close the broken device and clear the open flag on
* it. Closing the device means the RCM framework will
* be able to unconfigure the device if required.
*
* We have to check that the device is open, otherwise
* the first open on it has resulted in the error that
* is being processed and the actual cd.cd_dev will be
* NODEV64.
*
* If this is a multi-node mirror, then the multinode
* state checks following this code will cause the
* slave nodes to close the mirror in the function
* mirror_set_state().
*/
md_layered_close(cd.cd_dev, MD_OFLG_NULL);
shared->ms_flags &= ~MDM_S_ISOPEN;
}
} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
(shared->ms_flags & MDM_S_ISOPEN)) {
/*
* Similar to logic above except no log messages since we
* are just transitioning from Last Erred to Erred.
*/
get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
"get device", 0);
(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
md_layered_close(cd.cd_dev, MD_OFLG_NULL);
shared->ms_flags &= ~MDM_S_ISOPEN;
}
if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
(flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
/*
* For a multi-node mirror, send the state change to the
* master, which broadcasts to all nodes, including this
* one. Once the message is received, the state is set
* in-core and the master commits the change to disk.
* There is a case, comp_replace, where this function
* can be called from within an ioctl and therefore in this
* case, as the ioctl will already be called on each node,
* there is no need to xmit the state change to the master for
* distribution to the other nodes. MD_STATE_XMIT flag is used
* to indicate whether a xmit is required. The mirror's
* transient state is set to MD_ERR_PENDING to avoid sending
* multiple messages.
*/
if (newstate & (CS_ERRED|CS_LAST_ERRED))
ui->ui_tstate |= MD_ERR_PENDING;
/*
* Send a state update message to all nodes. This message
* will generate 2 submessages, the first one to suspend
* all writes to the mirror and the second to update the
* state and resume writes.
*/
stchmsg.msg_stch_mnum = un->c.un_self_id;
stchmsg.msg_stch_sm = smi;
stchmsg.msg_stch_comp = ci;
stchmsg.msg_stch_new_state = newstate;
stchmsg.msg_stch_hs_id = shared->ms_hs_id;
#ifdef DEBUG
if (mirror_debug_flag)
printf("send set state, %x, %x, %x, %x, %x\n",
stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
stchmsg.msg_stch_hs_id);
#endif
if (flags & MD_STATE_WMUPDATE) {
msgtype = MD_MN_MSG_STATE_UPDATE2;
/*
* When coming from an update of watermarks, there
* must already be a message logged that triggered
* this action. So, no need to log this message, too.
*/
msgflags = MD_MSGF_NO_LOG;
} else {
msgtype = MD_MN_MSG_STATE_UPDATE;
msgflags = MD_MSGF_DEFAULT_FLAGS;
}
/*
* If we are in the context of an ioctl, drop the ioctl lock.
* lockp holds the list of locks held.
*
* Otherwise, increment the appropriate reacquire counters.
* If the openclose lock is held, then we must reacquire the
* reader lock before releasing the openclose lock.
* Do not drop the ARRAY_WRITER lock as we may not be able
* to reacquire it.
*/
if (lockp) {
if (lockp->l_flags & MD_ARRAY_WRITER) {
save_lock = MD_ARRAY_WRITER;
lockp->l_flags &= ~MD_ARRAY_WRITER;
} else if (lockp->l_flags & MD_ARRAY_READER) {
save_lock = MD_ARRAY_READER;
lockp->l_flags &= ~MD_ARRAY_READER;
}
IOLOCK_RETURN_RELEASE(0, lockp);
} else {
if (flags & MD_STATE_OCHELD) {
md_unit_writerexit(ui);
(void) md_unit_readerlock(ui);
md_unit_openclose_exit(ui);
} else {
md_unit_writerexit(ui);
}
}
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
sscs_msg:
rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
(char *)&stchmsg, sizeof (stchmsg), kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
/* If we're shutting down already, pause things here. */
if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd is now available; retry the message
* one time. If that fails we fall through and
* panic as the system is in an unexpected state.
*/
if (nretries++ == 0)
goto sscs_msg;
}
cmn_err(CE_PANIC,
"ksend_message failure: STATE_UPDATE");
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
/* if dropped the lock previously, regain it */
if (lockp) {
IOLOCK_RETURN_REACQUIRE(lockp);
lockp->l_flags |= save_lock;
} else {
/*
* Reacquire dropped locks and update acquirecnts
* appropriately.
*/
if (flags & MD_STATE_OCHELD) {
/*
* openclose also grabs readerlock.
*/
(void) md_unit_openclose_enter(ui);
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
} else {
(void) md_unit_writerlock(ui);
}
}
ui->ui_tstate &= ~MD_ERR_PENDING;
} else {
shared->ms_state = newstate;
uniqtime32(&shared->ms_timestamp);
if (newstate == CS_ERRED)
shared->ms_flags |= MDM_S_NOWRITE;
else
shared->ms_flags &= ~MDM_S_NOWRITE;
shared->ms_flags &= ~MDM_S_IOERR;
un->un_changecnt++;
shared->ms_lasterrcnt = un->un_changecnt;
mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
mirror_commit(un, SMI2BIT(smi), extras);
}
if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
/*
* Resetting the Last Erred state will recursively call back
* into this function (set_sm_comp_state) to update the state.
*/
reset_lasterred(un, smi, extras, flags, lockp);
}
}
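/*
* find_another_logical:
* --------------------
* Check that the block range [blk, blk+cnt) of the mirror can be read
* from a submirror other than esm. esm is temporarily flagged with
* SMS_IGNORE so that select_read_unit() will not choose it, and the
* range is scanned in chunks of at most 1 Gigabyte. A Last Erred
* candidate is not accepted as a source for a Last Erred component if
* it errored earlier (lower ms_lasterrcnt).
*
* Returns:
*	0	another source exists for the whole range
*	1	part of the range has no other source
*/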
static int
find_another_logical(
mm_unit_t *un,
mm_submirror_t *esm,
diskaddr_t blk,
u_longlong_t cnt,
int must_be_open,
int state,
int err_cnt)
{
u_longlong_t cando;
md_dev64_t dev;
md_m_shared_t *s;
esm->sm_state |= SMS_IGNORE;
while (cnt != 0) {
u_longlong_t mcnt;
mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */
dev = select_read_unit(un, blk, mcnt, &cando,
must_be_open, &s, NULL);
if (dev == (md_dev64_t)0)
break;
if ((state == CS_LAST_ERRED) &&
(s->ms_state == CS_LAST_ERRED) &&
(err_cnt > s->ms_lasterrcnt))
break;
cnt -= cando;
blk += cando;
}
esm->sm_state &= ~SMS_IGNORE;
return (cnt != 0);
}
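/*
* mirror_other_sources:
* --------------------
* Determine whether the data held on component ci of submirror smi is
* also readable from another submirror. If ci is -1, every component of
* the submirror is checked. If must_be_open is set, only open components
* are considered as alternative sources.
*
* Returns:
*	0	other sources exist for all of the data
*	1	at least one block has no other source
*/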
int
mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
size_t count;
diskaddr_t block;
u_longlong_t skip;
u_longlong_t size;
md_dev64_t dev;
int cnt;
md_m_shared_t *s;
int not_found;
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
dev = sm->sm_dev;
/*
* Make sure every component of the submirror
* has other sources.
*/
if (ci < 0) {
/* Check that each component has other sources */
cnt = (*(smic->sm_get_component_count))(dev, sm);
for (ci = 0; ci < cnt; ci++) {
not_found = mirror_other_sources(un, smi, ci,
must_be_open);
if (not_found)
return (1);
}
return (0);
}
/*
* Make sure this component has other sources
*/
(void) (*(smic->sm_get_bcss))
(dev, sm, ci, &block, &count, &skip, &size);
if (count == 0)
return (1);
s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
while (count--) {
if (block >= un->c.un_total_blocks)
return (0);
if ((block + size) > un->c.un_total_blocks)
size = un->c.un_total_blocks - block;
not_found = find_another_logical(un, sm, block, size,
must_be_open, s->ms_state, s->ms_lasterrcnt);
if (not_found)
return (1);
block += size + skip;
}
return (0);
}
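/*
* finish_error:
* ------------
* Final stage of error processing for a parent i/o request. For a
* write-after-read the error is flagged to the resync originator (if
* requested via MD_MPS_FLAG_ERROR) and the request is completed. If the
* unit has changed beneath us (un_changecnt differs from ps_changecnt)
* the i/o is retried via md_mirror_strategy(). Otherwise the parent buf
* is errored and completed.
*/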
static void
finish_error(md_mps_t *ps)
{
struct buf *pb;
mm_unit_t *un;
mdi_unit_t *ui;
uint_t new_str_flags;
pb = ps->ps_bp;
un = ps->ps_un;
ui = ps->ps_ui;
/*
* Must flag any error to the resync originator if we're performing
* a Write-after-Read. This corresponds to an i/o error on a resync
* target device and in this case we ought to abort the resync as there
* is nothing that can be done to recover from this without operator
* intervention. If we don't set the B_ERROR flag we will continue
* reading from the mirror but won't write to the target (as it will
* have been placed into an errored state).
* To handle the case of multiple components within a submirror we only
* set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
* The originator of the resync read will cause this bit to be set if
* the underlying component count is one for a submirror resync. All
* other resync types will have the flag set as there is no underlying
* resync which can be performed on a contained metadevice for these
* resync types (optimized or component).
*/
if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
if (ps->ps_flags & MD_MPS_FLAG_ERROR)
pb->b_flags |= B_ERROR;
md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
MPS_FREE(mirror_parent_cache, ps);
md_unit_readerexit(ui);
md_biodone(pb);
return;
}
/*
* Set the MD_IO_COUNTED flag as we are retrying the same I/O
* operation; this I/O request has already been counted, and
* the I/O count variable will be decremented by mirror_done()'s
* call to md_biodone().
*/
if (ps->ps_changecnt != un->un_changecnt) {
new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
if (ps->ps_flags & MD_MPS_WOW)
new_str_flags |= MD_STR_WOW;
if (ps->ps_flags & MD_MPS_MAPPED)
new_str_flags |= MD_STR_MAPPED;
/*
* If this I/O request was a read that was part of a resync,
* set MD_STR_WAR for the retried read to ensure that the
* resync write (i.e. write-after-read) will be performed
*/
if (ps->ps_flags & MD_MPS_RESYNC_READ)
new_str_flags |= MD_STR_WAR;
md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
MPS_FREE(mirror_parent_cache, ps);
md_unit_readerexit(ui);
(void) md_mirror_strategy(pb, new_str_flags, NULL);
return;
}
pb->b_flags |= B_ERROR;
md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
MPS_FREE(mirror_parent_cache, ps);
md_unit_readerexit(ui);
md_biodone(pb);
}
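/*
* error_update_unit:
* -----------------
* Daemon routine that transitions each errored component to the Erred
* state or, if no other source exists for its data, to the Last Erred
* state. Hotspares are then poked (by message in a MN set) and the
* deferred i/o is completed by finish_error().
*/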
static void
error_update_unit(md_mps_t *ps)
{
mm_unit_t *un;
mdi_unit_t *ui;
int smi; /* sub mirror index */
int ci; /* errored component */
set_t setno;
uint_t flags; /* for set_sm_comp_state() */
uint_t hspflags; /* for check_comp_4_hotspares() */
ui = ps->ps_ui;
un = (mm_unit_t *)md_unit_writerlock(ui);
setno = MD_UN2SET(un);
/* All of these updates have to be propagated in case of an MN set */
flags = MD_STATE_XMIT;
hspflags = MD_HOTSPARE_XMIT;
/* special treatment if we are called during updating watermarks */
if (ps->ps_flags & MD_MPS_WMUPDATE) {
flags |= MD_STATE_WMUPDATE;
hspflags |= MD_HOTSPARE_WMUPDATE;
}
smi = 0;
ci = 0;
while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
if (mirror_other_sources(un, smi, ci, 0) == 1) {
/* Never called from ioctl context, so (IOLOCK *)NULL */
set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
(IOLOCK *)NULL);
/*
* For a MN set, the NOTIFY is done when the state
* change is processed on each node
*/
if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
continue;
}
/* Never called from ioctl context, so (IOLOCK *)NULL */
set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
(IOLOCK *)NULL);
/*
* For a MN set, the NOTIFY is done when the state
* change is processed on each node
*/
if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
smi = 0;
ci = 0;
}
md_unit_writerexit(ui);
if (MD_MNSET_SETNO(setno)) {
send_poke_hotspares(setno);
} else {
(void) poke_hotspares();
}
(void) md_unit_readerlock(ui);
finish_error(ps);
}
/*
* When we have a B_FAILFAST IO error on a Last Erred component we need to
* retry the IO without B_FAILFAST set so that we try to ensure that the
* component "sees" each IO.
*/
static void
last_err_retry(md_mcs_t *cs)
{
struct buf *cb;
md_mps_t *ps;
uint_t flags;
cb = &cs->cs_buf;
cb->b_flags &= ~B_FAILFAST;
/* if we're panicing just let this I/O error out */
if (panicstr) {
(void) mirror_done(cb);
return;
}
/* reissue the I/O */
ps = cs->cs_ps;
bioerror(cb, 0);
mutex_enter(&ps->ps_mx);
flags = MD_STR_NOTTOP;
if (ps->ps_flags & MD_MPS_MAPPED)
flags |= MD_STR_MAPPED;
if (ps->ps_flags & MD_MPS_NOBLOCK)
flags |= MD_NOBLOCK;
mutex_exit(&ps->ps_mx);
clear_retry_error(cb);
cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
md_shortname(getminor(cb->b_edev)));
md_call_strategy(cb, flags, NULL);
}
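/*
* mirror_error:
* ------------
* Process an errored parent i/o request. If an errored component is
* found, the state update is deferred to error_update_unit() on the
* md_mstr_daemon queue as the unit writerlock must be acquired. When
* panicking, the error is completed directly.
*/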
static void
mirror_error(md_mps_t *ps)
{
int smi; /* sub mirror index */
int ci; /* errored component */
if (panicstr) {
finish_error(ps);
return;
}
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
smi = 0;
ci = 0;
if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
md_unit_readerexit(ps->ps_ui);
daemon_request(&md_mstr_daemon, error_update_unit,
(daemon_queue_t *)ps, REQ_OLD);
return;
}
finish_error(ps);
}
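/*
* copy_write_done:
* ---------------
* iodone routine for one chunk of a write-on-write copy. On success,
* if any of the original write remains, the next chunk is scheduled
* via copy_write_cont() on the md_mstr_daemon queue; otherwise the
* wowhdr is freed and the parent buf is completed.
*/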
static int
copy_write_done(struct buf *cb)
{
md_mps_t *ps;
buf_t *pb;
char *wowbuf;
wowhdr_t *wowhdr;
ssize_t wow_resid;
/* get wowbuf and save structure */
wowbuf = cb->b_un.b_addr;
wowhdr = WOWBUF_HDR(wowbuf);
ps = wowhdr->wow_ps;
pb = ps->ps_bp;
/* Save error information, then free cb */
if (cb->b_flags & B_ERROR)
pb->b_flags |= B_ERROR;
if (cb->b_flags & B_REMAPPED)
bp_mapout(cb);
freerbuf(cb);
/* update residual and continue if needed */
if ((pb->b_flags & B_ERROR) == 0) {
wow_resid = pb->b_bcount - wowhdr->wow_offset;
pb->b_resid = wow_resid;
if (wow_resid > 0) {
daemon_request(&md_mstr_daemon, copy_write_cont,
(daemon_queue_t *)wowhdr, REQ_OLD);
return (1);
}
}
/* Write is complete, release resources. */
kmem_cache_free(mirror_wowblk_cache, wowhdr);
ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
MPS_FREE(mirror_parent_cache, ps);
md_biodone(pb);
return (0);
}
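/*
* copy_write_cont:
* ---------------
* Copy the next chunk (at most md_wowbuf_size bytes) of the parent
* write into the private wowbuf and issue it as a child write. Writing
* from a stable copy ensures that modification of the original buffer
* while the write is in flight cannot leave the submirrors with
* differing data.
*/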
static void
copy_write_cont(wowhdr_t *wowhdr)
{
buf_t *pb;
buf_t *cb;
char *wowbuf;
int wow_offset;
size_t wow_resid;
diskaddr_t wow_blkno;
wowbuf = WOWHDR_BUF(wowhdr);
pb = wowhdr->wow_ps->ps_bp;
/* get data on current location */
wow_offset = wowhdr->wow_offset;
wow_resid = pb->b_bcount - wow_offset;
wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
/* setup child buffer */
cb = getrbuf(KM_SLEEP);
cb->b_flags = B_WRITE;
cb->b_edev = pb->b_edev;
cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */
cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
cb->b_iodone = copy_write_done;
cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
cb->b_lblkno = wow_blkno;
/* move offset to next section */
wowhdr->wow_offset += cb->b_bcount;
/* copy and setup write for current section */
bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
/* do it */
/*
* Do not set the MD_IO_COUNTED flag as this is a new I/O request
* that handles the WOW condition. The resultant increment on the
* I/O count variable is cleared by copy_write_done()'s call to
* md_biodone().
*/
(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
| MD_STR_MAPPED, NULL);
}
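/*
* md_mirror_copy_write:
* --------------------
* Start a copy-based write-on-write sequence: allocate and initialize
* the wowhdr that tracks progress and issue the first chunk.
*/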
static void
md_mirror_copy_write(md_mps_t *ps)
{
wowhdr_t *wowhdr;
wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
mirror_wowblk_init(wowhdr);
wowhdr->wow_ps = ps;
wowhdr->wow_offset = 0;
copy_write_cont(wowhdr);
}
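/*
* handle_wow:
* ----------
* Entry point for write-on-write handling. Depending on
* md_mirror_wow_flg the write is either re-issued in place (WOW_NOCOPY)
* or performed in chunks from a private copy of the data.
*/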
static void
handle_wow(md_mps_t *ps)
{
buf_t *pb;
pb = ps->ps_bp;
bp_mapin(pb);
md_mirror_wow_cnt++;
if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
cmn_err(CE_NOTE,
"md: %s, blk %lld, cnt %ld: Write on write %d occurred",
md_shortname(getminor(pb->b_edev)),
(longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
}
/*
* Set the MD_IO_COUNTED flag as we are retrying the same I/O
* operation; this I/O request has already been counted, and
* the I/O count variable will be decremented by mirror_done()'s
* call to md_biodone().
*/
if (md_mirror_wow_flg & WOW_NOCOPY)
(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
MD_STR_MAPPED | MD_IO_COUNTED, ps);
else
md_mirror_copy_write(ps);
}
/*
* Return true if the specified submirror is either in the Last Erred
* state or is transitioning into the Last Erred state.
*/
static bool_t
submirror_is_lasterred(mm_unit_t *un, int smi)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
int ci;
int compcnt;
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
for (ci = 0; ci < compcnt; ci++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
if (shared->ms_state == CS_LAST_ERRED)
return (B_TRUE);
/*
* It is not currently Last Erred, check if entering Last Erred.
*/
if ((shared->ms_flags & MDM_S_IOERR) &&
((shared->ms_state == CS_OKAY) ||
(shared->ms_state == CS_RESYNC))) {
if (mirror_other_sources(un, smi, ci, 0) == 1)
return (B_TRUE);
}
}
return (B_FALSE);
}
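/*
* mirror_done:
* -----------
* iodone routine for a child (submirror) buf. If a B_FAILFAST i/o
* failed on a submirror that is in, or entering, the Last Erred state,
* the i/o is retried without B_FAILFAST via last_err_retry(). Otherwise
* any error is recorded and completion continues in mirror_done_common().
*/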
static int
mirror_done(struct buf *cb)
{
md_mps_t *ps;
md_mcs_t *cs;
/*LINTED*/
cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
ps = cs->cs_ps;
mutex_enter(&ps->ps_mx);
/* check if we need to retry an errored failfast I/O */
if (cb->b_flags & B_ERROR) {
struct buf *pb = ps->ps_bp;
if (cb->b_flags & B_FAILFAST) {
int i;
mm_unit_t *un = ps->ps_un;
for (i = 0; i < NMIRROR; i++) {
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
continue;
if (cb->b_edev ==
md_dev64_to_dev(un->un_sm[i].sm_dev)) {
/*
* This is the submirror that had the
* error. Check if it is Last Erred.
*/
if (submirror_is_lasterred(un, i)) {
daemon_queue_t *dqp;
mutex_exit(&ps->ps_mx);
dqp = (daemon_queue_t *)cs;
dqp->dq_prev = NULL;
dqp->dq_next = NULL;
daemon_request(&md_done_daemon,
last_err_retry, dqp,
REQ_OLD);
return (1);
}
break;
}
}
}
/* continue to process the buf without doing a retry */
ps->ps_flags |= MD_MPS_ERROR;
pb->b_error = cb->b_error;
}
return (mirror_done_common(cb));
}
/*
* Split from the original mirror_done function so we can handle bufs after a
* retry.
* ps->ps_mx is already held in the caller of this function and the cb error
* has already been checked and handled in the caller.
*/
static int
mirror_done_common(struct buf *cb)
{
struct buf *pb;
mm_unit_t *un;
mdi_unit_t *ui;
md_mps_t *ps;
md_mcs_t *cs;
size_t end_rr, start_rr, current_rr;
/*LINTED*/
cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
ps = cs->cs_ps;
pb = ps->ps_bp;
if (cb->b_flags & B_REMAPPED)
bp_mapout(cb);
ps->ps_frags--;
if (ps->ps_frags != 0) {
mutex_exit(&ps->ps_mx);
kmem_cache_free(mirror_child_cache, cs);
return (1);
}
un = ps->ps_un;
ui = ps->ps_ui;
/*
* Do not update outstanding_writes if we're running with ABR
* set for this mirror or the write() was issued with MD_STR_ABR set.
* Also a resync initiated write() has no outstanding_writes update
* either.
*/
if (((cb->b_flags & B_READ) == 0) &&
(un->un_nsm >= 2) &&
(ps->ps_call == NULL) &&
!((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
!(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
BLK_TO_RR(end_rr, ps->ps_lastblk, un);
BLK_TO_RR(start_rr, ps->ps_firstblk, un);
mutex_enter(&un->un_resync_mx);
for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
un->un_outstanding_writes[current_rr]--;
mutex_exit(&un->un_resync_mx);
}
kmem_cache_free(mirror_child_cache, cs);
mutex_exit(&ps->ps_mx);
if (ps->ps_call != NULL) {
daemon_request(&md_done_daemon, ps->ps_call,
(daemon_queue_t *)ps, REQ_OLD);
return (1);
}
if ((ps->ps_flags & MD_MPS_ERROR)) {
daemon_request(&md_done_daemon, mirror_error,
(daemon_queue_t *)ps, REQ_OLD);
return (1);
}
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
/*
* Handle the Write-on-Write problem.
* Skip in case of Raw and Direct I/O as they are
* handled earlier.
*/
if (!(md_mirror_wow_flg & WOW_DISABLE) &&
!(pb->b_flags & B_READ) &&
!(ps->ps_flags & MD_MPS_WOW) &&
!(pb->b_flags & B_PHYS) &&
any_pages_dirty(pb)) {
md_unit_readerexit(ps->ps_ui);
daemon_request(&md_mstr_daemon, handle_wow,
(daemon_queue_t *)ps, REQ_OLD);
return (1);
}
md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
MPS_FREE(mirror_parent_cache, ps);
md_unit_readerexit(ui);
md_biodone(pb);
return (0);
}
/*
* Clear error state in submirror component if the retry worked after
* a failfast error.
*/
static void
clear_retry_error(struct buf *cb)
{
int smi;
md_mcs_t *cs;
mm_unit_t *un;
mdi_unit_t *ui_sm;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
u_longlong_t cnt;
md_m_shared_t *shared;
/*LINTED*/
cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
un = cs->cs_ps->ps_un;
for (smi = 0; smi < NMIRROR; smi++) {
if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
continue;
if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
break;
}
if (smi >= NMIRROR)
return;
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
cnt = cb->b_bcount;
ui_sm = MDI_UNIT(getminor(cb->b_edev));
(void) md_unit_writerlock(ui_sm);
shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
cb->b_blkno, &cnt);
if (shared->ms_flags & MDM_S_IOERR) {
shared->ms_flags &= ~MDM_S_IOERR;
} else {
/* the buf spans components and the first one is not erred */
int cnt;
int i;
cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
for (i = 0; i < cnt; i++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, i);
if (shared->ms_flags & MDM_S_IOERR &&
shared->ms_state == CS_OKAY) {
shared->ms_flags &= ~MDM_S_IOERR;
break;
}
}
}
md_unit_writerexit(ui_sm);
}
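/*
* mirror_map_read:
* ---------------
* Select a submirror to satisfy (part of) a read request and set up the
* child buf accordingly.
*
* Returns:
*	0	this child covers the remainder of the request
*	>0	number of blocks mapped; the caller must issue further
*		children for the rest of the request
*/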
static size_t
mirror_map_read(
md_mps_t *ps,
md_mcs_t *cs,
diskaddr_t blkno,
u_longlong_t count
)
{
mm_unit_t *un;
buf_t *bp;
u_longlong_t cando;
bp = &cs->cs_buf;
un = ps->ps_un;
bp->b_lblkno = blkno;
if (fast_select_read_unit(ps, cs) == 0) {
bp->b_bcount = ldbtob(count);
return (0);
}
bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
count, &cando, 0, NULL, cs));
bp->b_bcount = ldbtob(cando);
if (count != cando)
return (cando);
return (0);
}
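/*
* write_after_read:
* ----------------
* ps_call hook run when a resync (or dirty region) read has completed.
* An errored read is handed to mirror_error(); otherwise the parent buf
* is re-issued as a write with MD_STR_WAR set so that the data just
* read is propagated to all submirrors.
*/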
static void
write_after_read(md_mps_t *ps)
{
struct buf *pb;
int flags;
if (ps->ps_flags & MD_MPS_ERROR) {
mirror_error(ps);
return;
}
pb = ps->ps_bp;
md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
ps->ps_call = NULL;
ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
flags = MD_STR_NOTTOP | MD_STR_WAR;
if (ps->ps_flags & MD_MPS_MAPPED)
flags |= MD_STR_MAPPED;
if (ps->ps_flags & MD_MPS_NOBLOCK)
flags |= MD_NOBLOCK;
if (ps->ps_flags & MD_MPS_DIRTY_RD)
flags |= MD_STR_DIRTY_RD;
(void) mirror_write_strategy(pb, flags, ps);
}
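/*
* continue_serial:
* ---------------
* ps_call hook used when the mirror write option is WR_SERIAL. Once the
* write to one submirror has completed, issue the same write to the
* next submirror.
*/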
static void
continue_serial(md_mps_t *ps)
{
md_mcs_t *cs;
buf_t *cb;
mm_unit_t *un;
int flags;
un = ps->ps_un;
cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
mirror_child_init(cs);
cb = &cs->cs_buf;
ps->ps_call = NULL;
ps->ps_frags = 1;
(void) mirror_map_write(un, cs, ps, 0);
flags = MD_STR_NOTTOP;
if (ps->ps_flags & MD_MPS_MAPPED)
flags |= MD_STR_MAPPED;
md_call_strategy(cb, flags, NULL);
}
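/*
* mirror_map_write:
* ----------------
* Clone the parent buf for the next writable submirror. A
* write-after-read (war) to block 0 of a labeled device is offset past
* the label. B_FAILFAST is only set if no component of the submirror is
* in the Last Erred state.
*
* Returns:
*	1	further submirrors remain to be written in parallel
*	0	this was the last submirror (or the next write will be
*		issued serially via continue_serial())
*	-1	write to a labeled partition smaller than the label; the
*		caller must terminate the request
*/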
static int
mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
{
int i;
dev_t dev; /* needed for bioclone, so not md_dev64_t */
buf_t *cb;
buf_t *pb;
diskaddr_t blkno;
size_t bcount;
off_t offset;
pb = ps->ps_bp;
cb = &cs->cs_buf;
cs->cs_ps = ps;
i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
blkno = pb->b_lblkno;
bcount = pb->b_bcount;
offset = 0;
if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
blkno = DK_LABEL_LOC + 1;
/*
* This handles the case where we're requesting
* a write to block 0 on a label partition
* and the request size was smaller than the
* size of the label. If this is the case
* then we'll return -1. Failure to do so will
* either cause the calling thread to hang due to
* an ssd bug, or worse if the bcount were allowed
* to go negative (ie large).
*/
if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
return (-1);
bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
}
cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
cb, KM_NOSLEEP);
if (war)
cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
/*
* If the submirror is in the erred state, check if any component is
* in the Last Erred state. If so, we don't want to use the B_FAILFAST
* flag on the IO.
*
* Provide a fast path for the non-erred case (which should be the
* normal case).
*/
if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int ci;
int compcnt;
sm = &un->un_sm[i];
smic = &un->un_smic[i];
compcnt = (*(smic->sm_get_component_count))
(sm->sm_dev, un);
for (ci = 0; ci < compcnt; ci++) {
md_m_shared_t *shared;
shared = (md_m_shared_t *)
(*(smic->sm_shared_by_indx))(sm->sm_dev,
sm, ci);
if (shared->ms_state == CS_LAST_ERRED)
break;
}
if (ci >= compcnt)
cb->b_flags |= B_FAILFAST;
} else {
cb->b_flags |= B_FAILFAST;
}
}
ps->ps_current_sm++;
if (ps->ps_current_sm != ps->ps_active_cnt) {
if (un->un_write_option == WR_SERIAL) {
ps->ps_call = continue_serial;
return (0);
}
return (1);
}
return (0);
}
/*
* directed_read_done:
* ------------------
* Completion routine called when a DMR request has been returned from the
* underlying driver. Wake-up the original ioctl() and return the data to
* the user.
*/
static void
directed_read_done(md_mps_t *ps)
{
mm_unit_t *un;
mdi_unit_t *ui;
un = ps->ps_un;
ui = ps->ps_ui;
md_unit_readerexit(ui);
md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
ps->ps_call = NULL;
mutex_enter(&un->un_dmr_mx);
cv_signal(&un->un_dmr_cv);
mutex_exit(&un->un_dmr_mx);
/* release the parent structure */
kmem_cache_free(mirror_parent_cache, ps);
}
/*
* daemon_io:
* ------------
* Called to issue a mirror_write_strategy() or mirror_read_strategy
* call from a blockable context. NOTE: no mutex can be held on entry to this
* routine
*/
static void
daemon_io(daemon_queue_t *dq)
{
md_mps_t *ps = (md_mps_t *)dq;
int flag = MD_STR_NOTTOP;
buf_t *pb = ps->ps_bp;
if (ps->ps_flags & MD_MPS_MAPPED)
flag |= MD_STR_MAPPED;
if (ps->ps_flags & MD_MPS_WOW)
flag |= MD_STR_WOW;
if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
flag |= MD_STR_WAR;
if (ps->ps_flags & MD_MPS_ABR)
flag |= MD_STR_ABR;
if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
flag |= MD_STR_BLOCK_OK;
/*
* If this is a resync read, i.e. MD_STR_DIRTY_RD is not set, set
* MD_STR_WAR before calling mirror_read_strategy.
*/
if (pb->b_flags & B_READ) {
if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
flag |= MD_STR_WAR;
mirror_read_strategy(pb, flag, ps);
} else
mirror_write_strategy(pb, flag, ps);
}
/*
* update_resync:
* -------------
* Called to update the in-core version of the resync record with the latest
* version that was committed to disk when the previous mirror owner
* relinquished ownership. This call is likely to block as we must hold-off
* any current resync processing that may be occurring.
* On completion of the resync record update we issue the mirror_write_strategy
* call to complete the i/o that first started this sequence. To remove a race
* condition between a new write() request which is submitted and the resync
* record update we acquire the writerlock. This will hold off all i/o to the
* mirror until the resync update has completed.
* NOTE: no mutex can be held on entry to this routine
*/
static void
update_resync(daemon_queue_t *dq)
{
md_mps_t *ps = (md_mps_t *)dq;
buf_t *pb = ps->ps_bp;
mdi_unit_t *ui = ps->ps_ui;
mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id);
set_t setno;
int restart_resync;
mutex_enter(&un->un_rrp_inflight_mx);
(void) md_unit_writerlock(ui);
ps->ps_un = un;
setno = MD_MIN2SET(getminor(pb->b_edev));
if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
/*
* Synchronize our in-core view of what regions need to be
* resync'd with the on-disk version.
*/
mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
un->un_dirty_bm);
/* Region dirty map is now up to date */
}
restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
md_unit_writerexit(ui);
mutex_exit(&un->un_rrp_inflight_mx);
/* Restart the resync thread if it was previously blocked */
if (restart_resync) {
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
cv_signal(&un->un_rs_thread_cv);
mutex_exit(&un->un_rs_thread_mx);
}
/* Continue with original deferred i/o */
daemon_io(dq);
}
/*
* owner_timeout:
* -------------
* Called if the original mdmn_ksend_message() failed and the request is to be
* retried. Reattempt the original ownership change.
*
* NOTE: called at interrupt context (see timeout(9f)).
*/
static void
owner_timeout(void *arg)
{
daemon_queue_t *dq = (daemon_queue_t *)arg;
daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
}
/*
* become_owner:
* ------------
* Called to issue RPC request to become the owner of the mirror
* associated with this i/o request. We assume that the ownership request
* is synchronous, so if it succeeds we will issue the request via
* mirror_write_strategy().
* If multiple i/o's are outstanding we will be called from the mirror_daemon
* service thread.
* NOTE: no mutex should be held on entry to this routine.
*/
static void
become_owner(daemon_queue_t *dq)
{
md_mps_t *ps = (md_mps_t *)dq;
mm_unit_t *un = ps->ps_un;
buf_t *pb = ps->ps_bp;
set_t setno;
md_mn_kresult_t *kres;
int msg_flags = md_mirror_msg_flags;
md_mps_t *ps1;
ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
/*
* If we're already the mirror owner we do not need to send a message
* but can simply process the i/o request immediately.
* If we've already sent the request to become owner we requeue the
* request as we're waiting for the synchronous ownership message to
* be processed.
*/
if (MD_MN_MIRROR_OWNER(un)) {
/*
* As the strategy() call will potentially block we need to
* punt this to a separate thread and complete this request
* as quickly as possible. Note: if this is a read request
* it must be a resync; we cannot afford to be queued
* behind any intervening i/o requests. In this case we put the
* request on the md_mirror_rs_daemon queue.
*/
if (pb->b_flags & B_READ) {
daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
REQ_OLD);
} else {
daemon_request(&md_mirror_io_daemon, daemon_io, dq,
REQ_OLD);
}
} else {
mutex_enter(&un->un_owner_mx);
if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
md_mn_req_owner_t *msg;
int rval = 0;
/*
* Check to see that we haven't exceeded the maximum
* retry count. If we have we fail the i/o as the
* comms mechanism has become wedged beyond recovery.
*/
if (dq->qlen++ >= MD_OWNER_RETRIES) {
mutex_exit(&un->un_owner_mx);
cmn_err(CE_WARN,
"md_mirror: Request exhausted ownership "
"retry limit of %d attempts", dq->qlen);
pb->b_error = EIO;
pb->b_flags |= B_ERROR;
pb->b_resid = pb->b_bcount;
kmem_cache_free(mirror_parent_cache, ps);
md_biodone(pb);
return;
}
/*
* Issue request to change ownership. The call is
* synchronous so when it returns we can complete the
* i/o (if successful), or enqueue it again so that
* the operation will be retried.
*/
un->un_owner_state |= MM_MN_OWNER_SENT;
mutex_exit(&un->un_owner_mx);
msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
setno = MD_MIN2SET(getminor(pb->b_edev));
msg->mnum = MD_SID(un);
msg->owner = md_mn_mynode_id;
msg_flags |= MD_MSGF_NO_LOG;
/*
* If this IO is triggered by updating a watermark,
* it might be issued by the creation of a softpartition
* while the commd subsystem is suspended.
* We don't want this message to block.
*/
if (ps->ps_flags & MD_MPS_WMUPDATE) {
msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
}
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno,
MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
(char *)msg, sizeof (md_mn_req_owner_t), kres);
kmem_free(msg, sizeof (md_mn_req_owner_t));
if (MDMN_KSEND_MSG_OK(rval, kres)) {
dq->qlen = 0;
/*
* Successfully changed owner, reread the
* resync record so that we have a valid idea of
* any previously committed incomplete write()s.
* NOTE: As we need to acquire the resync mutex
* this may block, so we defer it to a separate
* thread handler. This makes us (effectively)
* non-blocking once the ownership message
* handling has completed.
*/
mutex_enter(&un->un_owner_mx);
if (un->un_owner_state & MM_MN_BECOME_OWNER) {
un->un_mirror_owner = md_mn_mynode_id;
/* Sets owner of un_rr_dirty record */
if (un->un_rr_dirty_recid)
(void) mddb_setowner(
un->un_rr_dirty_recid,
md_mn_mynode_id);
un->un_owner_state &=
~MM_MN_BECOME_OWNER;
/*
* Release the block on the current
* resync region if it is blocked
*/
ps1 = un->un_rs_prev_overlap;
if ((ps1 != NULL) &&
(ps1->ps_flags & MD_MPS_ON_OVERLAP))
mirror_overlap_tree_remove(ps1);
mutex_exit(&un->un_owner_mx);
/*
* If we're a read, this must be a
* resync request, issue
* the i/o request on the
* md_mirror_rs_daemon queue. This is
* to avoid a deadlock between the
* resync_unit thread and
* subsequent i/o requests that may
* block on the resync region.
*/
if (pb->b_flags & B_READ) {
daemon_request(
&md_mirror_rs_daemon,
update_resync, dq, REQ_OLD);
} else {
daemon_request(
&md_mirror_io_daemon,
update_resync, dq, REQ_OLD);
}
kmem_free(kres,
sizeof (md_mn_kresult_t));
return;
} else {
/*
* Some other node has beaten us to
* obtain ownership. We need to
* reschedule our ownership request
*/
mutex_exit(&un->un_owner_mx);
}
} else {
mdmn_ksend_show_error(rval, kres,
"MD_MN_MSG_REQUIRE_OWNER");
/*
* Message transport failure is handled by the
* comms layer. If the ownership change request
* does not succeed we need to flag the error to
* the initiator of the i/o. This is handled by
* the retry logic above. As the request failed
* we do not know _who_ the owner of the mirror
* currently is. We reset our idea of the owner
* to None so that any further write()s will
* attempt to become the owner again. This stops
* multiple nodes writing to the same mirror
* simultaneously.
*/
mutex_enter(&un->un_owner_mx);
un->un_owner_state &=
~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
mutex_exit(&un->un_owner_mx);
}
kmem_free(kres, sizeof (md_mn_kresult_t));
} else
mutex_exit(&un->un_owner_mx);
/*
* Re-enqueue this request on the deferred i/o list. Delay the
* request for md_mirror_owner_to usecs to stop thrashing.
*/
(void) timeout(owner_timeout, dq,
drv_usectohz(md_mirror_owner_to));
}
}
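/*
* mirror_write_strategy:
* ---------------------
* Main entry point for writes to a mirror. For a MN set this deals with
* suspended writes, mirror ownership and blocking of the current resync
* region before the write is recorded in the dirty region bitmap and
* fanned out to the submirrors via mirror_map_write(). Write-on-write
* conditions are deferred to handle_wow().
*/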
static void
mirror_write_strategy(buf_t *pb, int flag, void *private)
{
md_mps_t *ps;
md_mcs_t *cs;
int more;
mm_unit_t *un;
mdi_unit_t *ui;
buf_t *cb; /* child buf pointer */
set_t setno;
int rs_on_overlap = 0;
ui = MDI_UNIT(getminor(pb->b_edev));
un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
md_kstat_waitq_enter(ui);
/*
* If a state change is in progress for this mirror in a MN set,
* suspend all non-resync writes until the state change is complete.
* The objective of this suspend is to ensure that it is not
* possible for one node to read data from a submirror that another node
* has not written to because of the state change. Therefore we
* suspend all writes until the state change has been made. As it is
* not possible to read from the target of a resync, there is no need
* to suspend resync writes.
* Note that we only block here if the caller can handle a busy-wait.
* The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
*/
if (!(flag & MD_STR_WAR)) {
if (flag & MD_STR_BLOCK_OK) {
mutex_enter(&un->un_suspend_wr_mx);
while (un->un_suspend_wr_flag) {
cv_wait(&un->un_suspend_wr_cv,
&un->un_suspend_wr_mx);
}
mutex_exit(&un->un_suspend_wr_mx);
}
(void) md_unit_readerlock(ui);
}
if (!(flag & MD_STR_NOTTOP)) {
if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
md_kstat_waitq_exit(ui);
return;
}
}
setno = MD_MIN2SET(getminor(pb->b_edev));
/* If an ABR write has been requested, set MD_STR_ABR flag */
if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
flag |= MD_STR_ABR;
if (private == NULL) {
ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
mirror_parent_init(ps);
} else {
ps = private;
private = NULL;
}
if (flag & MD_STR_MAPPED)
ps->ps_flags |= MD_MPS_MAPPED;
if (flag & MD_STR_WOW)
ps->ps_flags |= MD_MPS_WOW;
if (flag & MD_STR_ABR)
ps->ps_flags |= MD_MPS_ABR;
if (flag & MD_STR_WMUPDATE)
ps->ps_flags |= MD_MPS_WMUPDATE;
/*
* Save essential information from the original buffhdr
* in the md_save structure.
*/
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_bp = pb;
ps->ps_addr = pb->b_un.b_addr;
ps->ps_firstblk = pb->b_lblkno;
ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
ps->ps_changecnt = un->un_changecnt;
/*
* Check for suspended writes here. This is where we can defer the
* write request to the daemon_io queue which will then call us with
* the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
* the top of this routine.
*/
if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
mutex_enter(&un->un_suspend_wr_mx);
if (un->un_suspend_wr_flag) {
ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
mutex_exit(&un->un_suspend_wr_mx);
md_unit_readerexit(ui);
daemon_request(&md_mirror_daemon, daemon_io,
(daemon_queue_t *)ps, REQ_OLD);
return;
}
mutex_exit(&un->un_suspend_wr_mx);
}
/*
* If not MN owner and this is an ABR write, make sure the current
* resync region is in the overlaps tree
*/
mutex_enter(&un->un_owner_mx);
if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
md_mps_t *ps1;
/* Block the current resync region, if not already blocked */
ps1 = un->un_rs_prev_overlap;
if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
(ps1->ps_lastblk != 0))) {
/* Drop locks to avoid deadlock */
mutex_exit(&un->un_owner_mx);
md_unit_readerexit(ui);
wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
rs_on_overlap = 1;
(void) md_unit_readerlock(ui);
mutex_enter(&un->un_owner_mx);
/*
* Check to see if we have obtained ownership
* while waiting for overlaps. If we have, remove
* the resync_region entry from the overlap tree
*/
if (MD_MN_MIRROR_OWNER(un) &&
(ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
mirror_overlap_tree_remove(ps1);
rs_on_overlap = 0;
}
}
}
mutex_exit(&un->un_owner_mx);
/*
* The following keeps a write-after-read from writing to the
* source in the case where it all came from one place.
*/
if (flag & MD_STR_WAR) {
int abort_write = 0;
/*
* We are performing a write-after-read. This is either as a
* result of a resync read or as a result of a read in a
* dirty resync region when the optimized resync is not
* complete. If in a MN set and a resync generated i/o,
* if the current block is not in the current
* resync region terminate the write as another node must have
* completed this resync region
*/
if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
(!(flag & MD_STR_DIRTY_RD))) {
if (!IN_RESYNC_REGION(un, ps))
abort_write = 1;
}
if ((select_write_after_read_units(un, ps) == 0) ||
(abort_write)) {
#ifdef DEBUG
if (mirror_debug_flag)
printf("Abort resync write on %x, block %lld\n",
MD_SID(un), ps->ps_firstblk);
#endif
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
kmem_cache_free(mirror_parent_cache, ps);
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
md_biodone(pb);
return;
}
} else {
select_write_units(un, ps);
/* Drop readerlock to avoid deadlock */
md_unit_readerexit(ui);
wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
un = md_unit_readerlock(ui);
/*
* For a MN set with an ABR write, if we are now the
* owner and we have a resync region in the overlap
* tree, remove the entry from overlaps and retry the write.
*/
if (MD_MNSET_SETNO(setno) &&
((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
mutex_enter(&un->un_owner_mx);
if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
mirror_overlap_tree_remove(ps);
md_kstat_waitq_exit(ui);
mutex_exit(&un->un_owner_mx);
md_unit_readerexit(ui);
daemon_request(&md_mirror_daemon, daemon_io,
(daemon_queue_t *)ps, REQ_OLD);
return;
}
mutex_exit(&un->un_owner_mx);
}
}
/*
* For Multinode mirrors with no owner and a Resync Region (not ABR)
* we need to become the mirror owner before continuing with the
* write(). For ABR mirrors we check that we 'own' the resync if
* we're in write-after-read mode. We do this _after_ ensuring that
* there are no overlaps to ensure that once we know that we are
* the owner, the readerlock will not be released until the write is
* complete. As a change of ownership in a MN set requires the
* writerlock, this ensures that ownership cannot be changed until
* the write is complete.
*/
if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
(flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
if (MD_MN_NO_MIRROR_OWNER(un)) {
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
md_kstat_waitq_exit(ui);
ASSERT(!(flag & MD_STR_WAR));
md_unit_readerexit(ui);
daemon_request(&md_mirror_daemon, become_owner,
(daemon_queue_t *)ps, REQ_OLD);
return;
}
}
/*
* Mark resync region if mirror has a Resync Region _and_ we are not
* a resync initiated write(). Don't mark region if we're flagged as
* an ABR write.
*/
if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
!(flag & MD_STR_WAR)) {
if (mirror_mark_resync_region(un, ps->ps_firstblk,
ps->ps_lastblk, md_mn_mynode_id)) {
pb->b_flags |= B_ERROR;
pb->b_resid = pb->b_bcount;
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
kmem_cache_free(mirror_parent_cache, ps);
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
md_biodone(pb);
return;
}
}
ps->ps_childbflags = pb->b_flags | B_WRITE;
ps->ps_childbflags &= ~B_READ;
if (flag & MD_STR_MAPPED)
ps->ps_childbflags &= ~B_PAGEIO;
if (!(flag & MD_STR_NOTTOP) && panicstr)
/* Disable WOW and don't free ps */
ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
md_kstat_waitq_to_runq(ui);
/*
* Treat Raw and Direct I/O as Write-on-Write always
*/
if (!(md_mirror_wow_flg & WOW_DISABLE) &&
(md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
(pb->b_flags & B_PHYS) &&
!(ps->ps_flags & MD_MPS_WOW)) {
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
md_unit_readerexit(ui);
daemon_request(&md_mstr_daemon, handle_wow,
(daemon_queue_t *)ps, REQ_OLD);
return;
}
ps->ps_frags = 1;
do {
cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
mirror_child_init(cs);
cb = &cs->cs_buf;
more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
/*
* This handles the case where we're requesting
* a write to block 0 on a label partition. (more < 0)
* means that the request size was smaller than the
* size of the label. If so this request is done.
*/
if (more < 0) {
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
md_kstat_runq_exit(ui);
kmem_cache_free(mirror_child_cache, cs);
kmem_cache_free(mirror_parent_cache, ps);
md_unit_readerexit(ui);
md_biodone(pb);
return;
}
if (more) {
mutex_enter(&ps->ps_mx);
ps->ps_frags++;
mutex_exit(&ps->ps_mx);
}
md_call_strategy(cb, flag, private);
} while (more);
if (!(flag & MD_STR_NOTTOP) && panicstr) {
while (!(ps->ps_flags & MD_MPS_DONE)) {
md_daemon(1, &md_done_daemon);
drv_usecwait(10);
}
kmem_cache_free(mirror_parent_cache, ps);
}
}
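/*
* mirror_read_strategy:
* --------------------
* Main entry point for reads from a mirror. Handles directed (DMR)
* reads and determines whether a write-after-read is required (resync
* generated reads, or reads of a dirty region while an optimized resync
* is outstanding). The request is split across submirrors as needed via
* mirror_map_read().
*/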
static void
mirror_read_strategy(buf_t *pb, int flag, void *private)
{
md_mps_t *ps;
md_mcs_t *cs;
size_t more;
mm_unit_t *un;
mdi_unit_t *ui;
size_t current_count;
diskaddr_t current_blkno;
off_t current_offset;
buf_t *cb; /* child buf pointer */
set_t setno;
ui = MDI_UNIT(getminor(pb->b_edev));
md_kstat_waitq_enter(ui);
un = (mm_unit_t *)md_unit_readerlock(ui);
if (!(flag & MD_STR_NOTTOP)) {
if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
md_kstat_waitq_exit(ui);
return;
}
}
if (private == NULL) {
ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
mirror_parent_init(ps);
} else {
ps = private;
private = NULL;
}
if (flag & MD_STR_MAPPED)
ps->ps_flags |= MD_MPS_MAPPED;
if (flag & MD_NOBLOCK)
ps->ps_flags |= MD_MPS_NOBLOCK;
if (flag & MD_STR_WMUPDATE)
ps->ps_flags |= MD_MPS_WMUPDATE;
/*
* Check to see if this is a DMR driven read. If so we need to use the
* specified side (in un->un_dmr_last_read) for the source of the data.
*/
if (flag & MD_STR_DMR)
ps->ps_flags |= MD_MPS_DMR;
/*
* Save essential information from the original buffhdr
* in the md_save structure.
*/
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_bp = pb;
ps->ps_addr = pb->b_un.b_addr;
ps->ps_firstblk = pb->b_lblkno;
ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
ps->ps_changecnt = un->un_changecnt;
current_count = btodb(pb->b_bcount);
current_blkno = pb->b_lblkno;
current_offset = 0;
/*
* If flag has MD_STR_WAR set this means that the read is issued by a
* resync thread, which may or may not be an optimized resync.
*
* If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
* code has not completed; either a resync has not started since snarf,
* or there is an optimized resync in progress.
*
* We need to generate a write after this read in the following two
* cases,
*
* 1. Any Resync-Generated read
*
* 2. Any read to a DIRTY REGION if there is an optimized resync
* pending or in progress.
*
* The write after read is done in these cases to ensure that all sides
* of the mirror are in sync with the read data and that it is not
* possible for an application to read the same block multiple times
* and get different data.
*
* This would be possible if the block was in a dirty region.
*
* If we're performing a directed read we don't write the data out as
* the application is responsible for restoring the mirror to a known
* state.
*/
if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
!(flag & MD_STR_DMR)) {
size_t start_rr, i, end_rr;
int region_dirty = 1;
/*
* We enter here under three circumstances,
*
* MD_UN_OPT_NOT_DONE MD_STR_WAR
* 0 1
* 1 0
* 1 1
*
* To be optimal we only need to explicitly check for dirty
* regions in the second case, since if MD_STR_WAR is set we
* always do the write after read.
*/
if (!(flag & MD_STR_WAR)) {
BLK_TO_RR(end_rr, ps->ps_lastblk, un);
BLK_TO_RR(start_rr, ps->ps_firstblk, un);
for (i = start_rr; i <= end_rr; i++)
if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
break;
}
if ((region_dirty) &&
!(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
ps->ps_call = write_after_read;
/*
* Mark this as a RESYNC_READ in ps_flags.
* This is used if the read fails during a
* resync of a 3-way mirror to ensure that
* the retried read to the remaining
* good submirror has MD_STR_WAR set. This
* is needed to ensure that the resync write
* (write-after-read) takes place.
*/
ps->ps_flags |= MD_MPS_RESYNC_READ;
/*
* If MD_STR_FLAG_ERR is set in the flags we
* set MD_MPS_FLAG_ERROR so that an error on the resync
* write (issued by write_after_read) will be flagged
* to the biowait'ing resync thread. This allows us to
* avoid issuing further resync requests to a device
* that has had a write failure.
*/
if (flag & MD_STR_FLAG_ERR)
ps->ps_flags |= MD_MPS_FLAG_ERROR;
setno = MD_UN2SET(un);
/*
* Drop the readerlock to avoid
* deadlock
*/
md_unit_readerexit(ui);
wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
un = md_unit_readerlock(ui);
/*
* Ensure that we are owner
*/
if (MD_MNSET_SETNO(setno)) {
/*
* For a non-resync read that requires a
* write-after-read to be done, set a flag
* in the parent structure, so that the
* write_strategy routine can omit the
* test that the write is still within the
* resync region
*/
if (!(flag & MD_STR_WAR))
ps->ps_flags |= MD_MPS_DIRTY_RD;
/*
* Before reading the buffer, see if
* there is an owner.
*/
if (MD_MN_NO_MIRROR_OWNER(un)) {
ps->ps_call = NULL;
mirror_overlap_tree_remove(ps);
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
daemon_request(
&md_mirror_daemon,
become_owner,
(daemon_queue_t *)ps,
REQ_OLD);
return;
}
/*
* For a resync read, check to see if I/O is
* outside of the current resync region, or
* the resync has finished. If so
* just terminate the I/O
*/
if ((flag & MD_STR_WAR) &&
(!(un->c.un_status & MD_UN_WAR) ||
(!IN_RESYNC_REGION(un, ps)))) {
#ifdef DEBUG
if (mirror_debug_flag)
printf("Abort resync read "
"%x: %lld\n",
MD_SID(un),
ps->ps_firstblk);
#endif
mirror_overlap_tree_remove(ps);
kmem_cache_free(mirror_parent_cache,
ps);
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
md_biodone(pb);
return;
}
}
}
}
if (flag & MD_STR_DMR) {
ps->ps_call = directed_read_done;
}
if (!(flag & MD_STR_NOTTOP) && panicstr)
ps->ps_flags |= MD_MPS_DONTFREE;
md_kstat_waitq_to_runq(ui);
ps->ps_frags++;
do {
cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
mirror_child_init(cs);
cb = &cs->cs_buf;
cs->cs_ps = ps;
cb = md_bioclone(pb, current_offset, current_count, NODEV,
current_blkno, mirror_done, cb, KM_NOSLEEP);
more = mirror_map_read(ps, cs, current_blkno,
(u_longlong_t)current_count);
if (more) {
mutex_enter(&ps->ps_mx);
ps->ps_frags++;
mutex_exit(&ps->ps_mx);
}
/*
* Do these calculations now,
* so that we pickup a valid b_bcount from the chld_bp.
*/
current_count -= more;
current_offset += cb->b_bcount;
current_blkno += more;
md_call_strategy(cb, flag, private);
} while (more);
if (!(flag & MD_STR_NOTTOP) && panicstr) {
while (!(ps->ps_flags & MD_MPS_DONE)) {
md_daemon(1, &md_done_daemon);
drv_usecwait(10);
}
kmem_cache_free(mirror_parent_cache, ps);
}
}
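/*
* md_mirror_strategy:
* ------------------
* Top-level strategy routine for the mirror driver. Top-level i/o to a
* halted multi-owner set is held until the set is resumed; the i/o is
* then counted (unless already counted) and dispatched to the read or
* write strategy routine.
*/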
void
md_mirror_strategy(buf_t *bp, int flag, void *private)
{
set_t setno = MD_MIN2SET(getminor(bp->b_edev));
/*
* When doing IO to a multi owner meta device, check if set is halted.
* We do this check without the needed lock held, for performance
* reasons.
* If an IO just slips through while the set is locked via an
* MD_MN_SUSPEND_SET, we don't care about it.
* Only check for suspension if we are a top-level i/o request
* (MD_STR_NOTTOP is cleared in 'flag').
*/
if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
(MD_SET_HALTED | MD_SET_MNSET)) {
if ((flag & MD_STR_NOTTOP) == 0) {
mutex_enter(&md_mx);
/* Here we loop until the set is no longer halted */
while (md_set[setno].s_status & MD_SET_HALTED) {
cv_wait(&md_cv, &md_mx);
}
mutex_exit(&md_mx);
}
}
if ((flag & MD_IO_COUNTED) == 0) {
if ((flag & MD_NOBLOCK) == 0) {
if (md_inc_iocount(setno) != 0) {
bp->b_flags |= B_ERROR;
bp->b_error = ENXIO;
bp->b_resid = bp->b_bcount;
biodone(bp);
return;
}
} else {
md_inc_iocount_noblock(setno);
}
}
if (bp->b_flags & B_READ)
mirror_read_strategy(bp, flag, private);
else
mirror_write_strategy(bp, flag, private);
}
/*
* mirror_directed_read:
* --------------------
* Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
* so that the application can determine what (if any) resync needs to be
* performed. The data is copied out to the user-supplied buffer.
*
* Parameters:
* mdev - dev_t for the mirror device
* vdr - directed read parameters specifying location and submirror
* to perform the read from
* mode - used to ddi_copyout() any resulting data from the read
*
* Returns:
* 0 success
* !0 error code
* EINVAL - invalid request format
*/
int
mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
{
buf_t *bp;
minor_t mnum = getminor(mdev);
mdi_unit_t *ui = MDI_UNIT(mnum);
mm_unit_t *un;
mm_submirror_t *sm;
char *sm_nm;
uint_t next_side;
void *kbuffer;
if (ui == NULL)
return (ENXIO);
if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
return (EINVAL);
}
/* Check for aligned block access. We disallow non-aligned requests. */
if (vdr->vdr_offset % DEV_BSIZE) {
return (EINVAL);
}
/*
* Allocate kernel buffer for target of read(). If we had a reliable
* (sorry functional) DDI this wouldn't be needed.
*/
kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
if (kbuffer == NULL) {
cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
" bytes\n", vdr->vdr_nbytes);
return (ENOMEM);
}
bp = getrbuf(KM_SLEEP);
bp->b_un.b_addr = kbuffer;
bp->b_flags = B_READ;
bp->b_bcount = vdr->vdr_nbytes;
bp->b_lblkno = lbtodb(vdr->vdr_offset);
bp->b_edev = mdev;
un = md_unit_readerlock(ui);
/*
* If DKV_SIDE_INIT is set we need to determine the first available
* side to start reading from. If it isn't set we increment to the
* next readable submirror.
* If there are no readable submirrors we error out with DKV_DMR_ERROR.
* Note: we check for a readable submirror on completion of the i/o so
* we should _always_ have one available. If this becomes unavailable
* we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
* a metadetach is made between the completion of one DKIOCDMR ioctl
* and the start of the next (i.e. a sys-admin 'accident' occurred).
* The chance of this is small, but not non-existent.
*/
if (vdr->vdr_side == DKV_SIDE_INIT) {
next_side = 0;
} else {
next_side = vdr->vdr_side + 1;
}
while ((next_side < NMIRROR) &&
!SUBMIRROR_IS_READABLE(un, next_side))
next_side++;
if (next_side >= NMIRROR) {
vdr->vdr_flags |= DKV_DMR_ERROR;
freerbuf(bp);
vdr->vdr_bytesread = 0;
md_unit_readerexit(ui);
return (0);
}
/* Set the side to read from */
un->un_dmr_last_read = next_side;
md_unit_readerexit(ui);
/*
* Save timestamp for verification purposes. Can be read by debugger
* to verify that this ioctl has been executed and to find the number
* of DMR reads and the time of the last DMR read.
*/
uniqtime(&mirror_dmr_stats.dmr_timestamp);
mirror_dmr_stats.dmr_count++;
/* Issue READ request and wait for completion */
mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
mutex_enter(&un->un_dmr_mx);
cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
mutex_exit(&un->un_dmr_mx);
/*
* Check to see if we encountered an error during the read. If so we
* can make no guarantee about any possibly returned data.
*/
if ((bp->b_flags & B_ERROR) == 0) {
vdr->vdr_flags &= ~DKV_DMR_ERROR;
if (bp->b_resid) {
vdr->vdr_flags |= DKV_DMR_SHORT;
vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
} else {
vdr->vdr_flags |= DKV_DMR_SUCCESS;
vdr->vdr_bytesread = vdr->vdr_nbytes;
}
/* Copy the data read back out to the user supplied buffer */
if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
mode)) {
kmem_free(kbuffer, vdr->vdr_nbytes);
return (EFAULT);
}
} else {
/* Error out with DKV_DMR_ERROR */
vdr->vdr_flags |= DKV_DMR_ERROR;
vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
}
/*
* Update the DMR parameters with the side and name of submirror that
* we have just read from (un->un_dmr_last_read)
*/
un = md_unit_readerlock(ui);
vdr->vdr_side = un->un_dmr_last_read;
sm = &un->un_sm[un->un_dmr_last_read];
sm_nm = md_shortname(md_getminor(sm->sm_dev));
(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
/*
* Determine if we've completed the read cycle. This is true iff the
* next computed submirror (side) equals or exceeds NMIRROR. We cannot
* use un_nsm as we need to handle a sparse array of submirrors (which
* can occur if a submirror is metadetached).
*/
next_side = un->un_dmr_last_read + 1;
while ((next_side < NMIRROR) &&
!SUBMIRROR_IS_READABLE(un, next_side))
next_side++;
if (next_side >= NMIRROR) {
/* We've finished */
vdr->vdr_flags |= DKV_DMR_DONE;
}
md_unit_readerexit(ui);
freerbuf(bp);
kmem_free(kbuffer, vdr->vdr_nbytes);
return (0);
}
/*
* mirror_resync_message:
* ---------------------
* Handle the multi-node resync messages that keep all nodes within a given
* disk-set in sync with their view of a mirror's resync status.
*
* The message types dealt with are:
* MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit
* MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced
* MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit
* MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp
*
* Returns:
* 0 Success
* >0 Failure error number
*/
int
mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
{
mdi_unit_t *ui;
mm_unit_t *un;
set_t setno;
int is_ABR;
int smi;
int ci;
sm_state_t state;
int broke_out;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
md_error_t mde = mdnullerror;
md_mps_t *ps;
int rs_active;
int rr, rr_start, rr_end;
/* Check that the given device is part of a multi-node set */
setno = MD_MIN2SET(p->mnum);
if (setno >= md_nsets) {
return (ENXIO);
}
if (!MD_MNSET_SETNO(setno)) {
return (EINVAL);
}
if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
return (EINVAL);
if ((ui = MDI_UNIT(p->mnum)) == NULL)
return (EINVAL);
is_ABR = (ui->ui_tstate & MD_ABR_CAP);
/* Obtain the current resync status */
(void) md_ioctl_readerlock(lockp, ui);
rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
md_ioctl_readerexit(lockp);
switch ((md_mn_msgtype_t)p->msg_type) {
case MD_MN_MSG_RESYNC_STARTING:
/* Start the resync thread for the mirror */
(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
break;
case MD_MN_MSG_RESYNC_NEXT:
/*
* We have to release any previously marked overlap regions
* so that i/o can resume. Then we need to block the region
* from [rs_start..rs_start+rs_size) so that no i/o is issued.
* Update un_rs_resync_done and un_rs_resync_2_do.
*/
(void) md_ioctl_readerlock(lockp, ui);
/*
* Ignore the message if there is no active resync thread or
* if it is for a resync type that we have already completed.
* un_resync_completed is set to the last resync completed
* when processing a PHASE_DONE message.
*/
if (!rs_active || (p->rs_type == un->un_resync_completed))
break;
/*
* If this message is for the same resync and is for an earlier
* resync region, just ignore it. This can only occur if this
* node has progressed on to the next resync region before
* we receive this message. This can occur if the class for
* this message is busy and the originator has to retry thus
* allowing this node to move onto the next resync_region.
*/
if ((p->rs_type == un->un_rs_type) &&
(p->rs_start < un->un_resync_startbl))
break;
ps = un->un_rs_prev_overlap;
/* Allocate previous overlap reference if needed */
if (ps == NULL) {
ps = kmem_cache_alloc(mirror_parent_cache,
MD_ALLOCFLAGS);
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_firstblk = 0;
ps->ps_lastblk = 0;
ps->ps_flags = 0;
md_ioctl_readerexit(lockp);
(void) md_ioctl_writerlock(lockp, ui);
un->un_rs_prev_overlap = ps;
md_ioctl_writerexit(lockp);
} else
md_ioctl_readerexit(lockp);
if (p->rs_originator != md_mn_mynode_id) {
/*
* Clear our un_resync_bm for the regions completed.
* The owner (originator) will take care of itself.
*/
BLK_TO_RR(rr_end, ps->ps_lastblk, un);
BLK_TO_RR(rr_start, p->rs_start, un);
if (ps->ps_lastblk && rr_end < rr_start) {
BLK_TO_RR(rr_start, ps->ps_firstblk, un);
mutex_enter(&un->un_resync_mx);
/*
* Update our resync bitmap to reflect that
* another node has synchronized this range.
*/
for (rr = rr_start; rr <= rr_end; rr++) {
CLR_KEEPDIRTY(rr, un);
}
mutex_exit(&un->un_resync_mx);
}
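/*
* Illustrative arithmetic (assuming BLK_TO_RR divides a block number
* by the resync-region size to yield a region index): with
* 1024-block regions, a previous overlap of blocks [0..4095] gives
* rr_end = 3. If the new range starts at block 8192, rr_start = 8,
* so rr_end < rr_start; rr_start is then recomputed from
* ps_firstblk (region 0) and regions 0..3 are cleared above.
*/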
/*
* On all but the originating node, first update
* the resync state, then unblock the previous
* region and block the next one. No need
* to do this if the region is already blocked.
* Update the submirror state and flags from the
* originator. This keeps the cluster in sync with
* regards to the resync status.
*/
(void) md_ioctl_writerlock(lockp, ui);
un->un_rs_resync_done = p->rs_done;
un->un_rs_resync_2_do = p->rs_2_do;
un->un_rs_type = p->rs_type;
un->un_resync_startbl = p->rs_start;
md_ioctl_writerexit(lockp);
/*
* Use un_owner_mx to ensure that an ownership change
* cannot happen at the same time as this message
*/
mutex_enter(&un->un_owner_mx);
if (MD_MN_MIRROR_OWNER(un)) {
ps->ps_firstblk = p->rs_start;
ps->ps_lastblk = ps->ps_firstblk +
p->rs_size - 1;
} else {
if ((ps->ps_firstblk != p->rs_start) ||
(ps->ps_lastblk != p->rs_start +
p->rs_size - 1)) {
/* Remove previous overlap range */
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
ps->ps_firstblk = p->rs_start;
ps->ps_lastblk = ps->ps_firstblk +
p->rs_size - 1;
mutex_exit(&un->un_owner_mx);
/* Block this range from all i/o. */
if (ps->ps_firstblk != 0 ||
ps->ps_lastblk != 0)
wait_for_overlaps(ps,
MD_OVERLAP_ALLOW_REPEAT);
mutex_enter(&un->un_owner_mx);
/*
* Check to see if we have obtained
* ownership while waiting for
* overlaps. If we have, remove
* the resync_region entry from the
* overlap tree
*/
if (MD_MN_MIRROR_OWNER(un) &&
(ps->ps_flags & MD_MPS_ON_OVERLAP))
mirror_overlap_tree_remove(ps);
}
}
mutex_exit(&un->un_owner_mx);
/*
* If this is the first RESYNC_NEXT message (i.e.
* MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
* issue RESYNC_START NOTIFY event
*/
if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
SVM_TAG_METADEVICE, MD_UN2SET(un),
MD_SID(un));
}
/* Ensure that our local resync thread is running */
if (un->un_rs_thread == NULL) {
(void) mirror_resync_unit(p->mnum, NULL,
&p->mde, lockp);
}
}
break;
case MD_MN_MSG_RESYNC_FINISH:
/*
* Complete the resync by stopping the resync thread.
* Also release the previous overlap region field.
* Update the resync_progress_thread by cv_signal'ing it so
* that we mark the end of the resync as soon as possible. This
* avoids an unnecessary delay should the system panic after
* resync completion.
*/
#ifdef DEBUG
if (!rs_active) {
if (mirror_debug_flag)
printf("RESYNC_FINISH (mnum = %x), "
"Resync *NOT* active",
p->mnum);
}
#endif
if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
(p->rs_originator != md_mn_mynode_id)) {
mutex_enter(&un->un_rs_thread_mx);
un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
un->un_rs_thread_flags &=
~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
cv_signal(&un->un_rs_thread_cv);
mutex_exit(&un->un_rs_thread_mx);
}
if (is_ABR) {
/* Resync finished; as ABR is set, reset the owner to NULL */
mutex_enter(&un->un_owner_mx);
un->un_mirror_owner = 0;
mutex_exit(&un->un_owner_mx);
}
(void) md_ioctl_writerlock(lockp, ui);
ps = un->un_rs_prev_overlap;
if (ps != NULL) {
/* Remove previous overlap range */
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
/*
* Release the overlap range reference
*/
un->un_rs_prev_overlap = NULL;
kmem_cache_free(mirror_parent_cache, ps);
}
md_ioctl_writerexit(lockp);
/* Mark the resync as complete in the metadb */
un->un_rs_resync_done = p->rs_done;
un->un_rs_resync_2_do = p->rs_2_do;
un->un_rs_type = p->rs_type;
mutex_enter(&un->un_rs_progress_mx);
cv_signal(&un->un_rs_progress_cv);
mutex_exit(&un->un_rs_progress_mx);
un = md_ioctl_writerlock(lockp, ui);
un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
/* Deal with any pending grow_unit */
if (un->c.un_status & MD_UN_GROW_PENDING) {
if ((mirror_grow_unit(un, &mde) != 0) ||
(! mdismderror(&mde, MDE_GROW_DELAYED))) {
un->c.un_status &= ~MD_UN_GROW_PENDING;
}
}
md_ioctl_writerexit(lockp);
break;
case MD_MN_MSG_RESYNC_PHASE_DONE:
/*
* A phase of the resync (optimized, component or
* submirror) is complete. Update the mirror status.
* If the flag CLEAR_OPT_NOT_DONE is set, it means that the
* mirror owner is performing a resync. If we have just snarfed
* this set, then we must clear any of the flags set at snarf
* time by unit_setup_resync().
* Note that unit_setup_resync() sets up these flags to
* indicate that an optimized resync is required. These flags
* need to be reset because if we get here, the mirror owner
* will have handled the optimized resync.
* The flags that must be cleared are MD_UN_OPT_NOT_DONE and
* MD_UN_WAR. In addition, for each submirror,
* MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
* set to SMS_OFFLINE.
*/
#ifdef DEBUG
if (mirror_debug_flag)
printf("phase done mess received from %d, mnum=%x,"
"type=%x, flags=%x\n", p->rs_originator, p->mnum,
p->rs_type, p->rs_flags);
#endif
/*
* Ignore the message if there is no active resync thread.
*/
if (!rs_active)
break;
broke_out = p->rs_flags & MD_MN_RS_ERR;
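/*
* Note (an assumption about the RS_* accessors): rs_type packs the
* resync kind together with the submirror and component indices, and
* RS_TYPE(), RS_SMI() and RS_CI() below unpack those fields. For
* example, a component resync of component 2 on submirror 1 would
* give RS_TYPE(t) == MD_RS_COMPONENT, RS_SMI(t) == 1, RS_CI(t) == 2.
*/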
switch (RS_TYPE(p->rs_type)) {
case MD_RS_OPTIMIZED:
un = md_ioctl_writerlock(lockp, ui);
if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
/* If we are originator, just clear rs_type */
if (p->rs_originator == md_mn_mynode_id) {
SET_RS_TYPE_NONE(un->un_rs_type);
md_ioctl_writerexit(lockp);
break;
}
/*
* If CLEAR_OPT_NOT_DONE is set, only clear the
* flags if OPT_NOT_DONE is set *and* rs_type
* is MD_RS_NONE.
*/
if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
(RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
/* No resync in progress */
un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
un->c.un_status &= ~MD_UN_WAR;
} else {
/*
* We are in the middle of an
* optimized resync and this message
* should be ignored.
*/
md_ioctl_writerexit(lockp);
break;
}
} else {
/*
* This is the end of an optimized resync; clear the
* KEEP_DIRTY flag and, unless we broke out in error,
* the WAR flag.
*/
un->c.un_status &= ~MD_UN_KEEP_DIRTY;
if (!broke_out)
un->c.un_status &= ~MD_UN_WAR;
/*
* Clear our un_resync_bm for the regions
* completed. The owner (originator) will
* take care of itself.
*/
if (p->rs_originator != md_mn_mynode_id &&
(ps = un->un_rs_prev_overlap) != NULL) {
BLK_TO_RR(rr_start, ps->ps_firstblk,
un);
BLK_TO_RR(rr_end, ps->ps_lastblk, un);
mutex_enter(&un->un_resync_mx);
for (rr = rr_start; rr <= rr_end;
rr++) {
CLR_KEEPDIRTY(rr, un);
}
mutex_exit(&un->un_resync_mx);
}
}
/*
* Set resync_completed to last resync type and then
* clear resync_type to indicate no resync in progress
*/
un->un_resync_completed = un->un_rs_type;
SET_RS_TYPE_NONE(un->un_rs_type);
/*
* If the resync is the result of a submirror ONLINE,
* set the submirror state to SMS_RUNNING if the
* resync succeeded; otherwise set it back to SMS_OFFLINE.
*/
for (smi = 0; smi < NMIRROR; smi++) {
un->un_sm[smi].sm_flags &=
~MD_SM_RESYNC_TARGET;
if (SMS_BY_INDEX_IS(un, smi,
SMS_OFFLINE_RESYNC)) {
if (p->rs_flags &
MD_MN_RS_CLEAR_OPT_NOT_DONE) {
state = SMS_OFFLINE;
} else {
state = (broke_out ?
SMS_OFFLINE : SMS_RUNNING);
}
mirror_set_sm_state(
&un->un_sm[smi],
&un->un_smic[smi], state,
broke_out);
mirror_commit(un, NO_SUBMIRRORS,
0);
}
/*
* If we still have an offline submirror, set
* the OFFLINE_SM flag in the mirror status
*/
if (SMS_BY_INDEX_IS(un, smi,
SMS_OFFLINE))
un->c.un_status |=
MD_UN_OFFLINE_SM;
}
md_ioctl_writerexit(lockp);
break;
case MD_RS_SUBMIRROR:
un = md_ioctl_writerlock(lockp, ui);
smi = RS_SMI(p->rs_type);
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
/* Clear RESYNC target */
un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
/*
* Set resync_completed to last resync type and then
* clear resync_type to indicate no resync in progress
*/
un->un_resync_completed = un->un_rs_type;
SET_RS_TYPE_NONE(un->un_rs_type);
/*
* If the resync completed ok, set the submirror
* state to SMS_RUNNING; otherwise set it to SMS_ATTACHED.
*/
state = (broke_out ?
SMS_ATTACHED : SMS_RUNNING);
mirror_set_sm_state(sm, smic, state, broke_out);
un->c.un_status &= ~MD_UN_WAR;
mirror_commit(un, SMI2BIT(smi), 0);
md_ioctl_writerexit(lockp);
break;
case MD_RS_COMPONENT:
un = md_ioctl_writerlock(lockp, ui);
smi = RS_SMI(p->rs_type);
ci = RS_CI(p->rs_type);
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
shared = (md_m_shared_t *)
(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
un->c.un_status &= ~MD_UN_WAR;
/* Clear RESYNC target */
un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
/*
* Set resync_completed to last resync type and then
* clear resync_type to indicate no resync in progress
*/
un->un_resync_completed = un->un_rs_type;
SET_RS_TYPE_NONE(un->un_rs_type);
/*
* If the resync completed ok, set the component state
* to CS_OKAY.
*/
if (broke_out)
shared->ms_flags |= MDM_S_RS_TRIED;
else {
/*
* As we don't transmit the changes,
* no need to drop the lock.
*/
set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
MD_STATE_NO_XMIT, (IOLOCK *)NULL);
}
md_ioctl_writerexit(lockp);
break;
default:
break;
}
/*
* If the purpose of this PHASE_DONE message is just to
* indicate to all other nodes that the optimized resync
* required (OPT_NOT_DONE) flag is to be cleared, there is
* no need to generate a notify event as there has not
* actually been a resync.
*/
if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
if (broke_out) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
SVM_TAG_METADEVICE, MD_UN2SET(un),
MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
SVM_TAG_METADEVICE, MD_UN2SET(un),
MD_SID(un));
}
}
break;
default:
#ifdef DEBUG
cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
" %x\n", p->msg_type);
#endif
return (EINVAL);
}
return (0);
}
/* Return -1 if snarf of an optimized record fails and the set should be released */
static int
mirror_snarf(md_snarfcmd_t cmd, set_t setno)
{
mddb_recid_t recid;
int gotsomething;
int all_mirrors_gotten;
mm_unit_t *un;
mddb_type_t typ1;
mddb_de_ic_t *dep;
mddb_rb32_t *rbp;
size_t newreqsize;
mm_unit_t *big_un;
mm_unit32_od_t *small_un;
int retval;
mdi_unit_t *ui;
if (cmd == MD_SNARF_CLEANUP) {
if (md_get_setstatus(setno) & MD_SET_STALE)
return (0);
recid = mddb_makerecid(setno, 0);
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
un = (mm_unit_t *)mddb_getrecaddr(recid);
mirror_cleanup(un);
recid = mddb_makerecid(setno, 0);
}
}
return (0);
}
all_mirrors_gotten = 1;
gotsomething = 0;
recid = mddb_makerecid(setno, 0);
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
continue;
dep = mddb_getrecdep(recid);
dep->de_flags = MDDB_F_MIRROR;
rbp = dep->de_rb;
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
/*
* This means, we have an old and small
* record and this record hasn't already
* been converted. Before we create an
* incore metadevice from this we have to
* convert it to a big record.
*/
small_un =
(mm_unit32_od_t *)mddb_getrecaddr(recid);
newreqsize = sizeof (mm_unit_t);
big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
KM_SLEEP);
mirror_convert((caddr_t)small_un,
(caddr_t)big_un, SMALL_2_BIG);
kmem_free(small_un, dep->de_reqsize);
/*
* Update userdata and incore userdata;
* incores are at the end of un.
*/
dep->de_rb_userdata_ic = big_un;
dep->de_rb_userdata = big_un;
dep->de_icreqsize = newreqsize;
un = big_un;
rbp->rb_private |= MD_PRV_CONVD;
} else {
/*
* Unit already converted, just get the
* record address.
*/
un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
sizeof (*un), 0);
}
un->c.un_revision &= ~MD_64BIT_META_DEV;
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
/* Big device */
un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
sizeof (*un), 0);
un->c.un_revision |= MD_64BIT_META_DEV;
un->c.un_flag |= MD_EFILABEL;
break;
}
MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
/*
* Create minor device node for snarfed entry.
*/
(void) md_create_minor_node(setno, MD_SID(un));
if (MD_UNIT(MD_SID(un)) != NULL) {
mddb_setrecprivate(recid, MD_PRV_PENDDEL);
continue;
}
all_mirrors_gotten = 0;
retval = mirror_build_incore(un, 1);
if (retval == 0) {
mddb_setrecprivate(recid, MD_PRV_GOTIT);
md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
resync_start_timeout(setno);
gotsomething = 1;
} else {
return (retval);
}
/*
* Set flag to indicate that the mirror has not yet
* been through a reconfig. This flag is used for MN sets
* when determining whether to update the mirror state from
* the Master node.
*/
if (MD_MNSET_SETNO(setno)) {
ui = MDI_UNIT(MD_SID(un));
ui->ui_tstate |= MD_RESYNC_NOT_DONE;
}
}
if (!all_mirrors_gotten)
return (gotsomething);
recid = mddb_makerecid(setno, 0);
while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
mddb_setrecprivate(recid, MD_PRV_PENDDEL);
return (0);
}
static int
mirror_halt(md_haltcmd_t cmd, set_t setno)
{
unit_t i;
mdi_unit_t *ui;
minor_t mnum;
int reset_mirror_flag = 0;
if (cmd == MD_HALT_CLOSE)
return (0);
if (cmd == MD_HALT_OPEN)
return (0);
if (cmd == MD_HALT_UNLOAD)
return (0);
if (cmd == MD_HALT_CHECK) {
for (i = 0; i < md_nunits; i++) {
mnum = MD_MKMIN(setno, i);
if ((ui = MDI_UNIT(mnum)) == NULL)
continue;
if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
continue;
if (md_unit_isopen(ui))
return (1);
}
return (0);
}
if (cmd != MD_HALT_DOIT)
return (1);
for (i = 0; i < md_nunits; i++) {
mnum = MD_MKMIN(setno, i);
if ((ui = MDI_UNIT(mnum)) == NULL)
continue;
if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
continue;
reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
/* Set a flag if there is at least one mirror metadevice. */
reset_mirror_flag = 1;
}
/*
* Only wait for the global dr_timeout to finish
* - if there are mirror metadevices in this diskset or
* - if this is the local set since an unload of the md_mirror
* driver could follow a successful mirror halt in the local set.
*/
if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
while ((mirror_md_ops.md_head == NULL) &&
(mirror_timeout.dr_timeout_id != 0))
delay(md_hz);
}
return (0);
}
/*ARGSUSED3*/
static int
mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
{
IOLOCK lock;
minor_t mnum = getminor(*dev);
set_t setno;
/*
* When doing an open of a multi owner metadevice, check to see if this
* node is a starting node and if a reconfig cycle is underway.
* If so, the system isn't sufficiently set up to handle the
* open (which involves I/O during sp_validate), so fail with ENXIO.
*/
setno = MD_MIN2SET(mnum);
if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
(MD_SET_MNSET | MD_SET_MN_START_RC)) {
return (ENXIO);
}
if (md_oflags & MD_OFLG_FROMIOCTL) {
/*
* This indicates that the caller is an ioctl service routine.
* In this case we initialise our stack-based IOLOCK and pass
* this into the internal open routine. This allows multi-owner
* metadevices to avoid deadlocking if an error is encountered
* during the open() attempt. The failure case is:
* s-p -> mirror -> s-p (with error). Attempting to metaclear
* this configuration would deadlock as the mirror code has to
* send a state-update to the other nodes when it detects the
* failure of the underlying submirror with an errored soft-part
* on it. As there is a class1 message in progress (metaclear)
* set_sm_comp_state() cannot send another class1 message;
* instead we do not send a state_update message as the
* metaclear is distributed and the failed submirror will be
* cleared from the configuration by the metaclear.
*/
IOLOCK_INIT(&lock);
return (mirror_internal_open(getminor(*dev), flag, otyp,
md_oflags, &lock));
} else {
return (mirror_internal_open(getminor(*dev), flag, otyp,
md_oflags, (IOLOCK *)NULL));
}
}
/*ARGSUSED1*/
static int
mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
return (mirror_internal_close(getminor(dev), otyp, md_cflags,
(IOLOCK *)NULL));
}
/*
* This routine dumps memory to the disk. It assumes that the memory has
* already been mapped into mainbus space. It is called at disk interrupt
* priority when the system is in trouble.
*/
static int
mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
mm_unit_t *un;
dev_t mapdev;
int result;
int smi;
int any_succeed = 0;
int save_result = 0;
/*
* We don't need to grab the unit lock because nothing else
* is supposed to be happening; dump is also not supposed
* to sleep.
*/
un = (mm_unit_t *)MD_UNIT(getminor(dev));
if ((diskaddr_t)blkno >= un->c.un_total_blocks)
return (EINVAL);
if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
return (EINVAL);
for (smi = 0; smi < NMIRROR; smi++) {
if (!SUBMIRROR_IS_WRITEABLE(un, smi))
continue;
mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
result = bdev_dump(mapdev, addr, blkno, nblk);
if (result)
save_result = result;
if (result == 0)
any_succeed++;
}
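/*
* For example, with a three-way mirror in which one submirror is
* errored, bdev_dump() is attempted only on the two writeable sides;
* the dump is reported successful if at least one of those writes
* succeeded, otherwise the last error seen is returned.
*/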
if (any_succeed)
return (0);
return (save_result);
}
/*
* NAME: mirror_probe_dev
*
* DESCRIPTION: forcibly opens every component of a mirror.
*
* On entry the unit writerlock is held
*/
static int
mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
int i;
int smi;
int ci;
mm_unit_t *un;
int md_devopen = 0;
set_t setno;
int sm_cnt;
int sm_unavail_cnt;
if (md_unit_isopen(ui))
md_devopen++;
un = MD_UNIT(mnum);
setno = MD_UN2SET(un);
sm_cnt = 0;
sm_unavail_cnt = 0;
for (i = 0; i < NMIRROR; i++) {
md_dev64_t tmpdev;
mdi_unit_t *sm_ui;
if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
continue;
}
sm_cnt++;
tmpdev = un->un_sm[i].sm_dev;
(void) md_layered_open(mnum, &tmpdev,
MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
un->un_sm[i].sm_dev = tmpdev;
sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
/*
* Logic similar to that in mirror_open_all_devs. We set or
* clear the submirror Unavailable bit.
*/
(void) md_unit_writerlock(sm_ui);
if (submirror_unavailable(un, i, 1)) {
sm_ui->ui_tstate |= MD_INACCESSIBLE;
sm_unavail_cnt++;
} else {
sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
}
md_unit_writerexit(sm_ui);
}
/*
* If all of the submirrors are unavailable, the mirror is also
* unavailable.
*/
if (sm_cnt == sm_unavail_cnt) {
ui->ui_tstate |= MD_INACCESSIBLE;
} else {
ui->ui_tstate &= ~MD_INACCESSIBLE;
}
/*
* Start checking for probe failures. If failures occur we
* set the appropriate erred state only if the metadevice is in
* use. This is specifically to prevent unnecessary resyncs.
* For instance if the disks were accidentally disconnected when
* the system booted up then until the metadevice is accessed
* (like file system mount) the user can shutdown, recable and
* reboot w/o incurring a potentially huge resync.
*/
smi = 0;
ci = 0;
while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
if (mirror_other_sources(un, smi, ci, 0) == 1) {
/*
* Note that for a MN set, there is no need to call
* SE_NOTIFY as that is done when processing the
* state change
*/
if (md_devopen) {
/*
* Never called from ioctl context,
* so (IOLOCK *)NULL
*/
set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
0, MD_STATE_XMIT, (IOLOCK *)NULL);
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE,
ESC_SVM_LASTERRED,
SVM_TAG_METADEVICE, setno,
MD_SID(un));
}
continue;
} else {
(void) mirror_close_all_devs(un,
MD_OFLG_PROBEDEV);
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE,
ESC_SVM_OPEN_FAIL,
SVM_TAG_METADEVICE, setno,
MD_SID(un));
}
mirror_openfail_console_info(un, smi, ci);
return (ENXIO);
}
}
/*
* Note that for a MN set, there is no need to call
* SE_NOTIFY as that is done when processing the
* state change
*/
if (md_devopen) {
/* Never called from ioctl context, so (IOLOCK *)NULL */
set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
MD_STATE_XMIT, (IOLOCK *)NULL);
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
SVM_TAG_METADEVICE, setno,
MD_SID(un));
}
}
mirror_openfail_console_info(un, smi, ci);
ci++;
}
if (MD_MNSET_SETNO(setno)) {
send_poke_hotspares(setno);
} else {
(void) poke_hotspares();
}
(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
return (0);
}
static int
mirror_imp_set(
set_t setno
)
{
mddb_recid_t recid;
int gotsomething, i;
mddb_type_t typ1;
mddb_de_ic_t *dep;
mddb_rb32_t *rbp;
mm_unit32_od_t *un32;
mm_unit_t *un64;
md_dev64_t self_devt;
minor_t *self_id; /* minor needs to be updated */
md_parent_t *parent_id; /* parent needs to be updated */
mddb_recid_t *record_id; /* record id needs to be updated */
mddb_recid_t *optrec_id;
md_dev64_t tmpdev;
gotsomething = 0;
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
recid = mddb_makerecid(setno, 0);
while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
continue;
dep = mddb_getrecdep(recid);
rbp = dep->de_rb;
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
/*
* Small device
*/
un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
self_id = &(un32->c.un_self_id);
parent_id = &(un32->c.un_parent);
record_id = &(un32->c.un_record_id);
optrec_id = &(un32->un_rr_dirty_recid);
for (i = 0; i < un32->un_nsm; i++) {
tmpdev = md_expldev(un32->un_sm[i].sm_dev);
un32->un_sm[i].sm_dev = md_cmpldev
(md_makedevice(md_major, MD_MKMIN(setno,
MD_MIN2UNIT(md_getminor(tmpdev)))));
if (!md_update_minor(setno, mddb_getsidenum
(setno), un32->un_sm[i].sm_key))
goto out;
}
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
un64 = (mm_unit_t *)mddb_getrecaddr(recid);
self_id = &(un64->c.un_self_id);
parent_id = &(un64->c.un_parent);
record_id = &(un64->c.un_record_id);
optrec_id = &(un64->un_rr_dirty_recid);
for (i = 0; i < un64->un_nsm; i++) {
tmpdev = un64->un_sm[i].sm_dev;
un64->un_sm[i].sm_dev = md_makedevice
(md_major, MD_MKMIN(setno, MD_MIN2UNIT
(md_getminor(tmpdev))));
if (!md_update_minor(setno, mddb_getsidenum
(setno), un64->un_sm[i].sm_key))
goto out;
}
break;
}
/*
* If this is a top-level friendly-name metadevice,
* update its minor number in the namespace.
*/
if ((*parent_id == MD_NO_PARENT) &&
((rbp->rb_revision == MDDB_REV_RBFN) ||
(rbp->rb_revision == MDDB_REV_RB64FN))) {
self_devt = md_makedevice(md_major, *self_id);
if (!md_update_top_device_minor(setno,
mddb_getsidenum(setno), self_devt))
goto out;
}
/*
* Update the unit with the imported setno.
*/
mddb_setrecprivate(recid, MD_PRV_GOTIT);
*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
if (*parent_id != MD_NO_PARENT)
*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
*record_id = MAKERECID(setno, DBID(*record_id));
*optrec_id = MAKERECID(setno, DBID(*optrec_id));
gotsomething = 1;
}
out:
return (gotsomething);
}
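/*
* Illustrative arithmetic (assuming minor numbers pack the set in the
* high bits and the unit in the low bits, as the MD_MKMIN and
* MD_MIN2UNIT usage above suggests): importing unit 5 into setno 2
* rewrites its minor to MD_MKMIN(2, 5), and the parent, record and
* optimized-record ids are rebased onto the new set the same way via
* MAKERECID()/DBID().
*/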
/*
* NAME: mirror_check_offline
*
* DESCRIPTION: return offline_status = 1 if any submirrors are offline
*
* Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
* protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
* ioctl.
*/
int
mirror_check_offline(md_dev64_t dev, int *offline_status)
{
mm_unit_t *un;
md_error_t mde = mdnullerror;
if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
return (EINVAL);
*offline_status = 0;
if (un->c.un_status & MD_UN_OFFLINE_SM)
*offline_status = 1;
return (0);
}
/*
* NAME: mirror_inc_abr_count
*
* DESCRIPTION: increment the count of layered soft parts with ABR set
*
* Called from ioctl, so access to un_abr_count is protected by the global
* ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
*/
int
mirror_inc_abr_count(md_dev64_t dev)
{
mm_unit_t *un;
md_error_t mde = mdnullerror;
if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
return (EINVAL);
un->un_abr_count++;
return (0);
}
/*
* NAME: mirror_dec_abr_count
*
* DESCRIPTION: decrement the count of layered soft parts with ABR set
*
* Called from ioctl, so access to un_abr_count is protected by the global
* ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
*/
int
mirror_dec_abr_count(md_dev64_t dev)
{
mm_unit_t *un;
md_error_t mde = mdnullerror;
if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
return (EINVAL);
un->un_abr_count--;
return (0);
}
static md_named_services_t mirror_named_services[] = {
{(intptr_t (*)()) poke_hotspares, "poke hotspares" },
{(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS },
{mirror_rename_check, MDRNM_CHECK },
{(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS },
{(intptr_t (*)()) mirror_exchange_parent_update_to,
MDRNM_PARENT_UPDATE_TO},
{(intptr_t (*)()) mirror_exchange_self_update_from_down,
MDRNM_SELF_UPDATE_FROM_DOWN },
{(intptr_t (*)())mirror_probe_dev, "probe open test" },
{(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE },
{(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT },
{(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT },
{ NULL, 0 }
};
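/*
* Illustrative sketch (hypothetical caller; the exact lookup
* arguments are an assumption): other md modules resolve these
* entries by name at run time with md_get_named_service(), e.g.
*
*	intptr_t (*probe_test)();
*
*	probe_test = md_get_named_service(dev, 0,
*	    "probe open test", 0);
*	if (probe_test != NULL)
*		err = (*probe_test)(ui, mnum);
*
* The service names registered here must match the strings that
* callers pass to the lookup.
*/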
md_ops_t mirror_md_ops = {
mirror_open, /* open */
mirror_close, /* close */
md_mirror_strategy, /* strategy */
NULL, /* print */
mirror_dump, /* dump */
NULL, /* read */
NULL, /* write */
md_mirror_ioctl, /* mirror_ioctl, */
mirror_snarf, /* mirror_snarf */
mirror_halt, /* mirror_halt */
NULL, /* aread */
NULL, /* awrite */
mirror_imp_set, /* import set */
mirror_named_services
};
/* module-specific initialization */
static void
init_init()
{
md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
/* Initialize the parent and child save memory pools */
mirror_parent_cache = kmem_cache_create("md_mirror_parent",
sizeof (md_mps_t), 0, mirror_parent_constructor,
mirror_parent_destructor, mirror_run_queue, NULL, NULL,
0);
mirror_child_cache = kmem_cache_create("md_mirror_child",
sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
mirror_child_constructor, mirror_child_destructor,
mirror_run_queue, NULL, NULL, 0);
/*
* Ensure md_wowbuf_size is a multiple of DEV_BSIZE,
* then initialize the wowbuf memory pool.
*/
md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
if (md_wowbuf_size <= 0)
md_wowbuf_size = 2 * DEV_BSIZE;
if (md_wowbuf_size > (32 * DEV_BSIZE))
md_wowbuf_size = (32 * DEV_BSIZE);
md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
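/*
* Worked example: with DEV_BSIZE == 512 the default md_wowbuf_size of
* 16384 is already a multiple of DEV_BSIZE and sits exactly at the
* 32 * DEV_BSIZE cap, so it is used unchanged and each wowblk cache
* object is 16384 + sizeof (wowhdr_t) bytes.
*/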
mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
}
/* module-specific uninitialization (undo init_init()) */
static void
fini_uninit()
{
kmem_cache_destroy(mirror_parent_cache);
kmem_cache_destroy(mirror_child_cache);
kmem_cache_destroy(mirror_wowblk_cache);
mirror_parent_cache = mirror_child_cache =
mirror_wowblk_cache = NULL;
mutex_destroy(&mirror_timeout.dr_mx);
mutex_destroy(&hotspare_request.dr_mx);
mutex_destroy(&non_ff_drv_mutex);
}
/* define the module linkage */
MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())