/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Driver for Virtual Disk.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/t_lock.h>
#include <sys/dkio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/vtoc.h>
#include <sys/open.h>
#include <sys/file.h>
#include <vm/page.h>
#include <sys/callb.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/door.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/lvm/md_hotspares.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/ddi.h>
#include <sys/proc.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/md_basic.h>
/*
* The machine-specific clock tick rate (Hz) is kept here.
*/
extern clock_t md_hz;
/*
* Externs.
*/
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern major_t md_major;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;
extern dev_info_t *md_devinfo;
extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t hsp_rwlp;
extern md_krwlock_t ni_rwlp;
extern int md_num_daemons;
extern int md_status;
extern int md_ioctl_cnt;
extern int md_mtioctl_cnt;
extern struct metatransops metatransops;
extern md_event_queue_t *md_event_queue;
extern md_resync_t md_cpr_resync;
extern int md_done_daemon_threads;
extern int md_ff_daemon_threads;
extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern void *lookup_entry(struct nm_next_hdr *, set_t,
side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr *get_first_record(set_t, int, int);
extern dev_t getrootdev(void);
struct mdq_anchor md_done_daemon; /* done request queue */
struct mdq_anchor md_mstr_daemon; /* mirror error, WOW requests */
struct mdq_anchor md_mhs_daemon; /* mirror hotspare requests queue */
struct mdq_anchor md_hs_daemon; /* raid hotspare requests queue */
struct mdq_anchor md_ff_daemonq; /* failfast request queue */
struct mdq_anchor md_mirror_daemon; /* mirror owner queue */
struct mdq_anchor md_mirror_io_daemon; /* mirror owner i/o queue */
struct mdq_anchor md_mirror_rs_daemon; /* mirror resync done queue */
struct mdq_anchor md_sp_daemon; /* soft-part error daemon queue */
struct mdq_anchor md_mto_daemon; /* mirror timeout daemon queue */
int md_done_daemon_threads = 1; /* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1; /* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1; /* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1; /* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3; /* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1; /* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1; /* threads for md_mto_daemon requestq */
#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0; /* debug flag */
#endif
/*
*
* md_daemon_queues is a table of pointers to the request queues, together
* with the number of threads associated with each queue.
* When the number of threads is set to 1, requests on that queue are
* executed sequentially.
* The thread counts for all of the queues are defined as global
* variables to enable kernel tuning.
*
*/
#define MD_DAEMON_QUEUES 11
md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
{&md_done_daemon, &md_done_daemon_threads},
{&md_mstr_daemon, &md_mstr_daemon_threads},
{&md_hs_daemon, &md_hs_daemon_threads},
{&md_ff_daemonq, &md_ff_daemon_threads},
{&md_mirror_daemon, &md_mirror_daemon_threads},
{&md_mirror_io_daemon, &md_mirror_daemon_threads},
{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
{&md_sp_daemon, &md_sp_daemon_threads},
{&md_mhs_daemon, &md_mhs_daemon_threads},
{&md_mto_daemon, &md_mto_daemon_threads},
{0, 0}
};
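/*
* Illustrative tuning sketch (not part of the original source): because the
* per-queue thread counts above are plain global variables, they can be set
* from /etc/system before the md module loads, e.g. (values hypothetical):
*
*	set md:md_ff_daemon_threads = 5
*	set md:md_done_daemon_threads = 2
*
* Leaving a count at 1 preserves sequential execution on that queue.
*/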
/*
* Number of times a message is retried before issuing a warning to the operator
*/
#define MD_MN_WARN_INTVL 10
/*
* Setting the retry count to one (pre-decremented) so that we actually do
* no retries when committing/deleting a mddb rec. The underlying disk
* driver does several retries to check whether the disk is really dead or
* not, so there is no reason for us to retry on top of the driver's
* retries.
*/
uint_t md_retry_cnt = 1; /* global so it can be patched */
/*
* How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
* Again, made patchable here should it prove useful.
*/
uint_t md_send_retry_limit = 30;
/*
* Bug # 1212146
* Before this change the user had to pass in a short-aligned buffer because
* of problems in some underlying device drivers. This problem seems to have
* been corrected in the underlying drivers, so we default to not requiring
* any alignment. If the user needs to check for a specific alignment,
* md_uio_alignment_mask may be set in /etc/system to accomplish this. To
* get the behavior from before this fix, set md_uio_alignment_mask to 1;
* to check for word alignment, set it to 3; for double-word alignment, set
* it to 7; and so on.
*
* [The other part of the fix is in function md_chk_uio()]
*/
static int md_uio_alignment_mask = 0;
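/*
* Hedged example of the tuning described above (the specific value is
* hypothetical). To restore the pre-fix behavior of requiring short
* (2-byte) aligned uio buffers, add the following to /etc/system:
*
*	set md:md_uio_alignment_mask = 1
*
* md_chk_uio() is assumed to reject a buffer whose address overlaps the
* mask bits; 3 checks word alignment, 7 double-word alignment, etc.
*/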
/*
* for md_dev64_t translation
*/
struct md_xlate_table *md_tuple_table;
struct md_xlate_major_table *md_major_tuple_table;
int md_tuple_length;
uint_t md_majortab_len;
/* Function declarations */
static int md_create_probe_rqlist(md_probedev_impl_t *plist,
daemon_queue_t **hdr, intptr_t (*probe_test)());
/*
* manipulate global status
*/
void
md_set_status(int bits)
{
mutex_enter(&md_mx);
md_status |= bits;
mutex_exit(&md_mx);
}
void
md_clr_status(int bits)
{
mutex_enter(&md_mx);
md_status &= ~bits;
mutex_exit(&md_mx);
}
int
md_get_status()
{
int result;
mutex_enter(&md_mx);
result = md_status;
mutex_exit(&md_mx);
return (result);
}
void
md_set_setstatus(set_t setno, int bits)
{
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
md_set[setno].s_status |= bits;
mutex_exit(&md_mx);
}
void
md_clr_setstatus(set_t setno, int bits)
{
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
md_set[setno].s_status &= ~bits;
mutex_exit(&md_mx);
}
uint_t
md_get_setstatus(set_t setno)
{
uint_t result;
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
result = md_set[setno].s_status;
mutex_exit(&md_mx);
return (result);
}
/*
* md_unit_readerlock_common:
* -------------------------
* Mark the given unit as having a reader reference. Block (on ui_cv) until
* any writer references have been released.
*
* Input:
*	ui		unit reference
*	lock_held	0 => ui_mx needs to be grabbed
*			1 => ui_mx already held
* Output:
*	mm_unit_t corresponding to unit structure
*	ui->ui_readercnt incremented
*/
static void *
md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
{
uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
if (!lock_held)
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & flag) {
if (panicstr) {
if (ui->ui_lock & MD_UL_WRITER)
panic("md: writer lock is held");
break;
}
cv_wait(&ui->ui_cv, &ui->ui_mx);
}
ui->ui_readercnt++;
if (!lock_held)
mutex_exit(&ui->ui_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_unit_readerlock(mdi_unit_t *ui)
{
return (md_unit_readerlock_common(ui, 0));
}
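/*
* Hedged usage sketch (not from the original file): the reader/writer pairs
* are used to bracket access to a unit structure, e.g. in a hypothetical
* caller:
*
*	mdi_unit_t	*ui = MDI_UNIT(mnum);
*	md_unit_t	*un = (md_unit_t *)md_unit_readerlock(ui);
*	... read-only access to *un ...
*	md_unit_readerexit(ui);
*
* A modifying caller uses md_unit_writerlock()/md_unit_writerexit() instead
* and holds exclusive access once all readers have drained.
*/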
/*
* md_unit_writerlock_common:
* -------------------------
* Acquire a unique writer reference. Causes previous readers to drain.
* Blocks if a writer reference already exists or if a previous
* reader/writer dropped the lock to allow a ksend_message to be
* dispatched.
*
* Input:
*	ui		unit reference
*	lock_held	0 => grab ui_mx
*			1 => ui_mx already held on entry
* Output:
*	mm_unit_t reference
*/
static void *
md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
{
uint_t flag = MD_UL_WRITER;
if (panicstr)
panic("md: writer lock not allowed");
if (!lock_held)
mutex_enter(&ui->ui_mx);
while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
ui->ui_wanabecnt++;
ui->ui_lock |= MD_UL_WANABEWRITER;
cv_wait(&ui->ui_cv, &ui->ui_mx);
if (--ui->ui_wanabecnt == 0)
ui->ui_lock &= ~MD_UL_WANABEWRITER;
}
ui->ui_lock |= MD_UL_WRITER;
ui->ui_owner = curthread;
if (!lock_held)
mutex_exit(&ui->ui_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_unit_writerlock(mdi_unit_t *ui)
{
return (md_unit_writerlock_common(ui, 0));
}
/*
* md_unit_readerexit_common:
* -------------------------
* Release the readerlock for the specified unit. If the reader count reaches
* zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
*
* Input:
* ui unit reference
* lock_held 0 => ui_mx needs to be acquired
* 1 => ui_mx already held
*/
static void
md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
{
if (!lock_held)
mutex_enter(&ui->ui_mx);
ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
ASSERT(ui->ui_readercnt != 0);
ui->ui_readercnt--;
if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
cv_broadcast(&ui->ui_cv);
if (!lock_held)
mutex_exit(&ui->ui_mx);
}
void
md_unit_readerexit(mdi_unit_t *ui)
{
md_unit_readerexit_common(ui, 0);
}
/*
* md_unit_writerexit_common:
* -------------------------
* Release the writerlock currently held on the unit. Wake any threads waiting
* on becoming reader or writer (MD_UL_WANABEWRITER set).
*
* Input:
* ui unit reference
* lock_held 0 => ui_mx to be acquired
* 1 => ui_mx already held
*/
static void
md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
{
if (!lock_held)
mutex_enter(&ui->ui_mx);
ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
ASSERT(ui->ui_readercnt == 0);
ui->ui_lock &= ~MD_UL_WRITER;
ui->ui_owner = NULL;
cv_broadcast(&ui->ui_cv);
if (!lock_held)
mutex_exit(&ui->ui_mx);
}
void
md_unit_writerexit(mdi_unit_t *ui)
{
md_unit_writerexit_common(ui, 0);
}
void *
md_io_readerlock(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
ASSERT(io); /* checks case where no io lock allocated */
mutex_enter(&io->io_mx);
while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
if (panicstr) {
if (io->io_lock & MD_UL_WRITER)
panic("md: writer lock is held");
break;
}
cv_wait(&io->io_cv, &io->io_mx);
}
io->io_readercnt++;
mutex_exit(&io->io_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_io_writerlock(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
ASSERT(io); /* checks case where no io lock allocated */
if (panicstr)
panic("md: writer lock not allowed");
mutex_enter(&io->io_mx);
while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
io->io_wanabecnt++;
io->io_lock |= MD_UL_WANABEWRITER;
cv_wait(&io->io_cv, &io->io_mx);
if (--io->io_wanabecnt == 0)
io->io_lock &= ~MD_UL_WANABEWRITER;
}
io->io_lock |= MD_UL_WRITER;
io->io_owner = curthread;
mutex_exit(&io->io_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void
md_io_readerexit(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
mutex_enter(&io->io_mx);
ASSERT((io->io_lock & MD_UL_WRITER) == 0);
ASSERT(io->io_readercnt != 0);
io->io_readercnt--;
if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
cv_broadcast(&io->io_cv);
}
mutex_exit(&io->io_mx);
}
void
md_io_writerexit(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
mutex_enter(&io->io_mx);
ASSERT((io->io_lock & MD_UL_WRITER) != 0);
ASSERT(io->io_readercnt == 0);
io->io_lock &= ~MD_UL_WRITER;
io->io_owner = NULL;
cv_broadcast(&io->io_cv);
mutex_exit(&io->io_mx);
}
/*
* Attempt to grab that set of locks defined as global.
* A mask containing the set of global locks that are owned upon
* entry is input. Any additional global locks are then grabbed.
* This keeps the caller from having to know the set of global
* locks.
*/
static int
md_global_lock_enter(int global_locks_owned_mask)
{
/*
* The current implementation has been verified by inspection
* and test to be deadlock free. If another global lock is
* added, changing the algorithm used by this function should
* be considered. With more than 2 locks it is difficult to
* guarantee that locks are being acquired in the correct order.
* The safe approach would be to drop all of the locks that are
* owned at function entry and then reacquire all of the locks
* in the order defined by the lock hierarchy.
*/
mutex_enter(&md_mx);
if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
while ((md_mtioctl_cnt != 0) ||
(md_status & MD_GBL_IOCTL_LOCK)) {
if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
mutex_exit(&md_mx);
return (EINTR);
}
}
md_status |= MD_GBL_IOCTL_LOCK;
md_ioctl_cnt++;
}
if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
while (md_status & MD_GBL_HS_LOCK) {
if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
md_status &= ~MD_GBL_IOCTL_LOCK;
mutex_exit(&md_mx);
return (EINTR);
}
}
md_status |= MD_GBL_HS_LOCK;
}
mutex_exit(&md_mx);
return (0);
}
/*
* Release the set of global locks that were grabbed in md_global_lock_enter
* that were not already owned by the calling thread. The set of previously
* owned global locks is passed in as a mask parameter.
*/
static int
md_global_lock_exit(int global_locks_owned_mask, int code,
int flags, mdi_unit_t *ui)
{
mutex_enter(&md_mx);
/* If MT ioctl decrement mt_ioctl_cnt */
if ((flags & MD_MT_IOCTL)) {
md_mtioctl_cnt--;
} else {
if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
/* clear the lock and decrement count */
ASSERT(md_ioctl_cnt == 1);
md_ioctl_cnt--;
md_status &= ~MD_GBL_IOCTL_LOCK;
}
if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
md_status &= ~MD_GBL_HS_LOCK;
}
if (flags & MD_READER_HELD)
md_unit_readerexit(ui);
if (flags & MD_WRITER_HELD)
md_unit_writerexit(ui);
if (flags & MD_IO_HELD)
md_io_writerexit(ui);
if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
rw_exit(&md_unit_array_rw.lock);
}
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
return (code);
}
/*
* The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
* use of the md_global_lock_{enter|exit} functions to avoid duplication
* of code. They rely upon the fact that the locks that are specified in
* the input mask are not acquired or freed. If this algorithm changes
* as described in the block comment at the beginning of md_global_lock_enter
* then it will be necessary to change these 2 functions. Otherwise these
* functions will be grabbing and holding global locks unnecessarily.
*/
int
md_ioctl_lock_enter(void)
{
/* grab only the ioctl lock */
return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
}
/*
* If md_ioctl_lock_exit is being called at the end of an ioctl before
* returning to user space, then ioctl_end is set to 1.
* Otherwise, the ioctl lock is being dropped in the middle of handling
* an ioctl and will be reacquired before the end of the ioctl.
* Do not attempt to process the MN diskset mddb parse flags unless
* ioctl_end is true - otherwise a deadlock situation could arise.
*/
int
md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
{
int ret_val;
uint_t status;
mddb_set_t *s;
int i;
int ii;
int err;
md_mn_msg_mddb_parse_t *mddb_parse_msg;
md_mn_kresult_t *kresult;
mddb_lb_t *lbp;
int rval = 1;
int flag;
/* release only the ioctl lock */
ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
/*
* If md_ioctl_lock_exit is being called with a possible lock held
* (ioctl_end is 0), then don't check the MN disksets since the
* call to mddb_setenter may cause a lock ordering deadlock.
*/
if (!ioctl_end)
return (ret_val);
/*
* Walk through disksets to see if there is a MN diskset that
* has messages that need to be sent. Set must be snarfed and
* be a MN diskset in order to be checked.
*
* In a MN diskset, this routine may send messages to the
* rpc.mdcommd in order to have the slave nodes re-parse parts
* of the mddb. Messages can only be sent with no locks held,
* so if mddb change occurred while the ioctl lock is held, this
* routine must send the messages.
*/
for (i = 1; i < md_nsets; i++) {
status = md_get_setstatus(i);
/* Set must be snarfed and be a MN diskset */
if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
(MD_SET_SNARFED | MD_SET_MNSET))
continue;
/* Grab set lock so that set can't change */
if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
continue;
lbp = s->s_lbp;
/* Re-get set status now that lock is held */
status = md_get_setstatus(i);
/*
* If MN parsing block flag is set - continue to next set.
*
* If s_mn_parseflags_sending is non-zero, then another thread
* is already currently sending a parse message, so just
* release the set mutex. If this ioctl had caused an mddb
* change that results in a parse message to be generated,
* the thread that is currently sending a parse message would
* generate the additional parse message.
*
* If s_mn_parseflags_sending is zero then loop until
* s_mn_parseflags is 0 (until there are no more
* messages to send).
* While s_mn_parseflags is non-zero,
* put snapshot of parse_flags in s_mn_parseflags_sending
* set s_mn_parseflags to zero
* release set mutex
* send message
* re-grab set mutex
* set s_mn_parseflags_sending to zero
*
* If set is STALE, send message with NO_LOG flag so that
* rpc.mdcommd won't attempt to log message to non-writeable
* replica.
*/
mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
KM_SLEEP);
while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
(s->s_mn_parseflags & MDDB_PARSE_MASK) &&
(!(status & MD_SET_MNPARSE_BLK))) {
/* Grab snapshot of parse flags */
s->s_mn_parseflags_sending = s->s_mn_parseflags;
s->s_mn_parseflags = 0;
mutex_exit(&md_set[(s)->s_setno].s_dbmx);
/*
* Send the message to the slaves to re-parse
* the indicated portions of the mddb. Send the status
* of the 50 mddbs in this set so that slaves know
* which mddbs that the master node thinks are 'good'.
* Otherwise, slave may reparse, but from wrong
* replica.
*/
mddb_parse_msg->msg_parse_flags =
s->s_mn_parseflags_sending;
for (ii = 0; ii < MDDB_NLB; ii++) {
mddb_parse_msg->msg_lb_flags[ii] =
lbp->lb_locators[ii].l_flags;
}
kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
KM_SLEEP);
while (rval != 0) {
flag = 0;
if (status & MD_SET_STALE)
flag |= MD_MSGF_NO_LOG;
rval = mdmn_ksend_message(s->s_setno,
MD_MN_MSG_MDDB_PARSE, flag, 0,
(char *)mddb_parse_msg,
sizeof (md_mn_msg_mddb_parse_t), kresult);
/* if the node hasn't yet joined, it's Ok. */
if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
(kresult->kmmr_comm_state !=
MDMNE_NOT_JOINED)) {
mdmn_ksend_show_error(rval, kresult,
"MD_MN_MSG_MDDB_PARSE");
cmn_err(CE_WARN, "md_ioctl_lock_exit: "
"Unable to send mddb update "
"message to other nodes in "
"diskset %s\n", s->s_setname);
rval = 1;
}
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
/*
* Re-grab mutex to clear sending field and to
* see if another parse message needs to be generated.
*/
mutex_enter(&md_set[(s)->s_setno].s_dbmx);
s->s_mn_parseflags_sending = 0;
}
kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
mutex_exit(&md_set[(s)->s_setno].s_dbmx);
}
return (ret_val);
}
/*
* Called when in an ioctl and need readerlock.
*/
void *
md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_READER_HELD;
return (md_unit_readerlock_common(ui, 0));
}
/*
* Called when in an ioctl and need writerlock.
*/
void *
md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_WRITER_HELD;
return (md_unit_writerlock_common(ui, 0));
}
void *
md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_IO_HELD;
return (md_io_writerlock(ui));
}
void
md_ioctl_readerexit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_READER_HELD;
md_unit_readerexit(lock->l_ui);
}
void
md_ioctl_writerexit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_WRITER_HELD;
md_unit_writerexit(lock->l_ui);
}
void
md_ioctl_io_exit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_IO_HELD;
md_io_writerexit(lock->l_ui);
}
/*
* md_ioctl_releaselocks:
* --------------------
* Release the unit locks that are held and stop subsequent
* md_unit_reader/writerlock calls from progressing. This allows the caller
* to send messages across the cluster when running in a multinode
* environment.
* ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
* allowed to progress as normal. This is required as these typically are
* invoked by the message handler that may be called while a unit lock is
* marked as released.
*
* On entry:
* variety of unit locks may be held including ioctl lock
*
* On exit:
* locks released and unit structure updated to prevent subsequent reader/
* writer locks being acquired until md_ioctl_reacquirelocks is called
*/
void
md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
{
/* This actually releases the locks. */
(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
}
/*
* md_ioctl_reacquirelocks:
* ----------------------
* Reacquire the locks that were held when md_ioctl_releaselocks
* was called.
*
* On entry:
* No unit locks held
* On exit:
* locks held that were held at md_ioctl_releaselocks time including
* the ioctl lock.
*/
void
md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
{
if (flags & MD_MT_IOCTL) {
mutex_enter(&md_mx);
md_mtioctl_cnt++;
mutex_exit(&md_mx);
} else {
while (md_ioctl_lock_enter() == EINTR)
;
}
if (flags & MD_ARRAY_WRITER) {
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
} else if (flags & MD_ARRAY_READER) {
rw_enter(&md_unit_array_rw.lock, RW_READER);
}
if (ui != (mdi_unit_t *)NULL) {
if (flags & MD_IO_HELD) {
(void) md_io_writerlock(ui);
}
mutex_enter(&ui->ui_mx);
if (flags & MD_READER_HELD) {
(void) md_unit_readerlock_common(ui, 1);
} else if (flags & MD_WRITER_HELD) {
(void) md_unit_writerlock_common(ui, 1);
}
/* Wake up any blocked readerlock() calls */
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
}
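/*
* Hedged sketch of the release/reacquire protocol (handler and message names
* hypothetical): a multinode ioctl that must dispatch a ksend_message drops
* its locks first and then restores them:
*
*	md_ioctl_releaselocks(0, flags, ui);
*	rval = mdmn_ksend_message(setno, msgtype, 0, 0, data, size, kres);
*	md_ioctl_reacquirelocks(flags, ui);
*
* The same flags value must be passed to both calls so that exactly the
* locks held at release time are reacquired.
*/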
void
md_ioctl_droplocks(IOLOCK *lock)
{
mdi_unit_t *ui;
int flags;
ASSERT(lock != NULL);
ui = lock->l_ui;
flags = lock->l_flags;
if (flags & MD_READER_HELD) {
lock->l_flags &= ~MD_READER_HELD;
md_unit_readerexit(ui);
}
if (flags & MD_WRITER_HELD) {
lock->l_flags &= ~MD_WRITER_HELD;
md_unit_writerexit(ui);
}
if (flags & MD_IO_HELD) {
lock->l_flags &= ~MD_IO_HELD;
md_io_writerexit(ui);
}
if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
rw_exit(&md_unit_array_rw.lock);
}
}
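/*
* Hedged sketch of the IOLOCK pattern (the handler below is hypothetical):
* an ioctl routine records what it holds in the IOLOCK and releases it all
* in one place:
*
*	static int
*	example_ioctl(mdi_unit_t *ui, IOLOCK *lock)
*	{
*		md_unit_t *un = (md_unit_t *)md_ioctl_readerlock(lock, ui);
*		int err = 0;
*		... inspect *un, set err ...
*		md_ioctl_droplocks(lock);
*		return (err);
*	}
*/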
void
md_array_writer(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags |= MD_ARRAY_WRITER;
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
}
void
md_array_reader(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags |= MD_ARRAY_READER;
rw_enter(&md_unit_array_rw.lock, RW_READER);
}
/*
* Called when in an ioctl and need opencloselock.
* Sets flags in lockp for READER_HELD.
*/
void *
md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
{
void *un;
ASSERT(lockp != NULL);
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & MD_UL_OPENORCLOSE)
cv_wait(&ui->ui_cv, &ui->ui_mx);
ui->ui_lock |= MD_UL_OPENORCLOSE;
/* Maintain mutex across the readerlock call */
lockp->l_ui = ui;
lockp->l_flags |= MD_READER_HELD;
un = md_unit_readerlock_common(ui, 1);
mutex_exit(&ui->ui_mx);
return (un);
}
/*
* Clears reader lock using md_ioctl instead of md_unit
* and updates lockp.
*/
void
md_ioctl_openclose_exit(IOLOCK *lockp)
{
mdi_unit_t *ui;
ASSERT(lockp != NULL);
ui = lockp->l_ui;
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
md_ioctl_readerexit(lockp);
mutex_enter(&ui->ui_mx);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
/*
* Clears reader lock using md_ioctl instead of md_unit
* and updates lockp.
* Does not acquire or release the ui_mx lock since the calling
* routine has already acquired this lock.
*/
void
md_ioctl_openclose_exit_lh(IOLOCK *lockp)
{
mdi_unit_t *ui;
ASSERT(lockp != NULL);
ui = lockp->l_ui;
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
lockp->l_flags &= ~MD_READER_HELD;
md_unit_readerexit_common(lockp->l_ui, 1);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
}
void *
md_unit_openclose_enter(mdi_unit_t *ui)
{
void *un;
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & (MD_UL_OPENORCLOSE))
cv_wait(&ui->ui_cv, &ui->ui_mx);
ui->ui_lock |= MD_UL_OPENORCLOSE;
/* Maintain mutex across the readerlock call */
un = md_unit_readerlock_common(ui, 1);
mutex_exit(&ui->ui_mx);
return (un);
}
void
md_unit_openclose_exit(mdi_unit_t *ui)
{
md_unit_readerexit(ui);
mutex_enter(&ui->ui_mx);
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
/*
* Drop the openclose and readerlocks without acquiring or
* releasing the ui_mx lock since the calling routine has
* already acquired this lock.
*/
void
md_unit_openclose_exit_lh(mdi_unit_t *ui)
{
md_unit_readerexit_common(ui, 1);
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
}
int
md_unit_isopen(
mdi_unit_t *ui
)
{
int isopen;
/* check status */
mutex_enter(&ui->ui_mx);
isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
mutex_exit(&ui->ui_mx);
return (isopen);
}
int
md_unit_incopen(
minor_t mnum,
int flag,
int otyp
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
int err = 0;
/* check type and flags */
ASSERT(ui != NULL);
mutex_enter(&ui->ui_mx);
if ((otyp < 0) || (otyp >= OTYPCNT)) {
err = EINVAL;
goto out;
}
if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
(ui->ui_lock & MD_UL_EXCL)) {
err = EBUSY;
goto out;
}
/* count and flag open */
ui->ui_ocnt[otyp]++;
ui->ui_lock |= MD_UL_OPEN;
if (flag & FEXCL)
ui->ui_lock |= MD_UL_EXCL;
/* setup kstat, return success */
mutex_exit(&ui->ui_mx);
md_kstat_init(mnum);
return (0);
/* return error */
out:
mutex_exit(&ui->ui_mx);
return (err);
}
int
md_unit_decopen(
minor_t mnum,
int otyp
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
int err = 0;
unsigned i;
/* check type and flags */
ASSERT(ui != NULL);
mutex_enter(&ui->ui_mx);
if ((otyp < 0) || (otyp >= OTYPCNT)) {
err = EINVAL;
goto out;
} else if (ui->ui_ocnt[otyp] == 0) {
err = ENXIO;
goto out;
}
/* count and flag closed */
if (otyp == OTYP_LYR)
ui->ui_ocnt[otyp]--;
else
ui->ui_ocnt[otyp] = 0;
ui->ui_lock &= ~MD_UL_OPEN;
for (i = 0; (i < OTYPCNT); ++i)
if (ui->ui_ocnt[i] != 0)
ui->ui_lock |= MD_UL_OPEN;
if (! (ui->ui_lock & MD_UL_OPEN))
ui->ui_lock &= ~MD_UL_EXCL;
/* teardown kstat, return success */
if (! (ui->ui_lock & MD_UL_OPEN)) {
/*
* We have a race condition inherited from specfs between
* open() and close() calls. This results in the kstat
* for a pending I/O being torn down, and then a panic.
* To avoid this, only tear the kstat down if there are
* no other readers on this device.
*/
if (ui->ui_readercnt > 1) {
mutex_exit(&ui->ui_mx);
} else {
mutex_exit(&ui->ui_mx);
md_kstat_destroy(mnum);
}
return (0);
}
/* return error */
out:
mutex_exit(&ui->ui_mx);
return (err);
}
md_dev64_t
md_xlate_targ_2_mini(md_dev64_t targ_devt)
{
dev32_t mini_32_devt, targ_32_devt;
int i;
/*
* Check to see if we're in an upgrade situation.
* If we are not in an upgrade, just return the input device.
*/
if (!MD_UPGRADE)
return (targ_devt);
targ_32_devt = md_cmpldev(targ_devt);
i = 0;
while (i != md_tuple_length) {
if (md_tuple_table[i].targ_devt == targ_32_devt) {
mini_32_devt = md_tuple_table[i].mini_devt;
return (md_expldev((md_dev64_t)mini_32_devt));
}
i++;
}
return (NODEV64);
}
md_dev64_t
md_xlate_mini_2_targ(md_dev64_t mini_devt)
{
dev32_t mini_32_devt, targ_32_devt;
int i;
if (!MD_UPGRADE)
return (mini_devt);
mini_32_devt = md_cmpldev(mini_devt);
i = 0;
while (i != md_tuple_length) {
if (md_tuple_table[i].mini_devt == mini_32_devt) {
targ_32_devt = md_tuple_table[i].targ_devt;
return (md_expldev((md_dev64_t)targ_32_devt));
}
i++;
}
return (NODEV64);
}
void
md_xlate_free(int size)
{
kmem_free(md_tuple_table, size);
}
char *
md_targ_major_to_name(major_t maj)
{
char *drv_name = NULL;
int i;
if (!MD_UPGRADE)
return (ddi_major_to_name(maj));
for (i = 0; i < md_majortab_len; i++) {
if (md_major_tuple_table[i].targ_maj == maj) {
drv_name = md_major_tuple_table[i].drv_name;
break;
}
}
return (drv_name);
}
major_t
md_targ_name_to_major(char *drv_name)
{
major_t maj;
int i;
maj = md_getmajor(NODEV64);
if (!MD_UPGRADE)
return (ddi_name_to_major(drv_name));
for (i = 0; i < md_majortab_len; i++) {
if ((strcmp(md_major_tuple_table[i].drv_name,
drv_name)) == 0) {
maj = md_major_tuple_table[i].targ_maj;
break;
}
}
return (maj);
}
void
md_majortab_free()
{
size_t sz;
int i;
for (i = 0; i < md_majortab_len; i++) {
freestr(md_major_tuple_table[i].drv_name);
}
sz = md_majortab_len * sizeof (struct md_xlate_major_table);
kmem_free(md_major_tuple_table, sz);
}
/* Returns a pointer to a function which returns an intptr_t */
intptr_t (*
md_get_named_service(md_dev64_t dev, int modindex, char *name,
intptr_t (*Default)()))()
{
mdi_unit_t *ui;
md_named_services_t *sp;
int i;
/*
* Return the first named service found.
* Use this path when it is known that there is only
* one named service possible (e.g., hotspare interface)
*/
if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL) {
continue;
}
sp = md_ops[i]->md_services;
if (sp == NULL)
continue;
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
}
return (Default);
}
/*
* Return the named service for the given modindex.
* This is used if there are multiple possible named services
* and each one needs to be called (e.g., poke hotspares)
*/
if (dev == NODEV64) {
if (modindex >= MD_NOPS)
return (Default);
if (md_ops[modindex] == NULL)
return (Default);
sp = md_ops[modindex]->md_services;
if (sp == NULL)
return (Default);
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
return (Default);
}
/*
* Return the named service for this md_dev64_t
*/
if (md_getmajor(dev) != md_major)
return (Default);
if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
(MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
return (NULL);
if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
return (NULL);
sp = md_ops[ui->ui_opsindex]->md_services;
if (sp == NULL)
return (Default);
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
return (Default);
}
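/*
* Hedged usage sketch: callers look a service up once and cache the result,
* falling back to the supplied default when a module does not provide it.
* The service name below is real (see md_hot_spare_ifc()); the surrounding
* lines are illustrative only:
*
*	intptr_t (*svc)() = md_get_named_service(NODEV64, ANY_SERVICE,
*	    "hot spare interface", 0);
*	if (svc != NULL)
*		err = (*svc)(cmd, id, size, labeled, hs_id, key, dev, sblock);
*/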
/*
* CPR (checkpoint/resume) callback routine for the md daemons
*/
boolean_t
callb_md_cpr(void *arg, int code)
{
callb_cpr_t *cp = (callb_cpr_t *)arg;
int ret = 0; /* assume success */
clock_t delta;
mutex_enter(cp->cc_lockp);
switch (code) {
case CB_CODE_CPR_CHKPT:
/*
* Check for active resync threads
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
if ((md_cpr_resync.md_mirror_resync > 0) ||
(md_cpr_resync.md_raid_resync > 0)) {
mutex_exit(&md_cpr_resync.md_resync_mutex);
cmn_err(CE_WARN, "There are Solaris Volume Manager "
"synchronization threads running.");
cmn_err(CE_WARN, "Please try system suspension at "
"a later time.");
ret = -1;
break;
}
mutex_exit(&md_cpr_resync.md_resync_mutex);
cp->cc_events |= CALLB_CPR_START;
delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_reltimedwait() returns -1 if it times out. */
if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
break;
break;
case CB_CODE_CPR_RESUME:
cp->cc_events &= ~CALLB_CPR_START;
cv_signal(&cp->cc_stop_cv);
break;
}
mutex_exit(cp->cc_lockp);
return (ret != -1);
}
void
md_daemon(int pass_thru, mdq_anchor_t *anchor)
{
daemon_queue_t *dq;
callb_cpr_t cprinfo;
if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
return;
/*
* Register cpr callback
*/
CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
/*CONSTCOND*/
while (1) {
mutex_enter(&anchor->a_mx);
while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
if (pass_thru) {
/*
* CALLB_CPR_EXIT Will do
* mutex_exit(&anchor->a_mx)
*/
CALLB_CPR_EXIT(&cprinfo);
return;
}
if (md_get_status() & MD_GBL_DAEMONS_DIE) {
mutex_exit(&anchor->a_mx);
mutex_enter(&md_mx);
md_num_daemons--;
mutex_exit(&md_mx);
/*
* CALLB_CPR_EXIT will do
* mutex_exit(&anchor->a_mx)
*/
mutex_enter(&anchor->a_mx);
CALLB_CPR_EXIT(&cprinfo);
thread_exit();
}
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&anchor->a_cv, &anchor->a_mx);
CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
}
dq->dq_prev->dq_next = dq->dq_next;
dq->dq_next->dq_prev = dq->dq_prev;
dq->dq_prev = dq->dq_next = NULL;
anchor->dq.qlen--;
mutex_exit(&anchor->a_mx);
(*(dq->dq_call))(dq);
}
/*NOTREACHED*/
}
/*
* daemon_request:
*
* Adds a request to the appropriate request queue, which is anchored by
* *anchor.
* The request is the first element of a doubly linked circular list.
* When the request is a single element, the forward and backward
* pointers MUST point to the element itself.
*/
void
daemon_request(mdq_anchor_t *anchor, void (*func)(),
daemon_queue_t *request, callstyle_t style)
{
daemon_queue_t *rqtp;
int i = 0;
rqtp = request;
if (style == REQ_OLD) {
ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
/* set it to the new style */
rqtp->dq_prev = rqtp->dq_next = rqtp;
}
ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
/* scan the list and add the function to each element */
do {
rqtp->dq_call = func;
i++;
rqtp = rqtp->dq_next;
} while (rqtp != request);
/* save pointer to tail of the request list */
rqtp = request->dq_prev;
mutex_enter(&anchor->a_mx);
/* stats */
anchor->dq.qlen += i;
anchor->dq.treqs += i;
anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
anchor->dq.qlen : anchor->dq.maxq_len;
/* now add the list to request queue */
request->dq_prev = anchor->dq.dq_prev;
rqtp->dq_next = &anchor->dq;
anchor->dq.dq_prev->dq_next = request;
anchor->dq.dq_prev = rqtp;
cv_broadcast(&anchor->a_cv);
mutex_exit(&anchor->a_mx);
}
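/*
* Hedged sketch of queueing a request (struct and handler names are
* hypothetical): a request embeds a daemon_queue_t as its first member so
* the daemon can link it and later invoke dq_call on it:
*
*	typedef struct example_req {
*		daemon_queue_t	dq;		(must be first)
*		int		payload;
*	} example_req_t;
*
*	static void
*	example_handler(daemon_queue_t *dqp)
*	{
*		example_req_t *rp = (example_req_t *)dqp;
*		... process rp->payload, then free/recycle rp ...
*	}
*
*	daemon_request(&md_done_daemon, example_handler, &req.dq, REQ_OLD);
*
* With REQ_OLD the dq_next/dq_prev pointers must be NULL on entry; the
* routine converts the element to a single-entry circular list itself.
*/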
void
mddb_commitrec_wrapper(mddb_recid_t recid)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_commitrec(recid)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database commit failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
setno = mddb_getsetnum(recid);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
void
mddb_commitrecs_wrapper(mddb_recid_t *recids)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_commitrecs(recids)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database commit failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
/*
* since all the records are part of the same set
* use the first one to get setno
*/
setno = mddb_getsetnum(*recids);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
void
mddb_deleterec_wrapper(mddb_recid_t recid)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_deleterec(recid)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database delete failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
setno = mddb_getsetnum(recid);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
/*
* md_holdset_enter is called in order to hold the set in its
* current state (loaded, unloaded, snarfed, unsnarfed, etc)
* until md_holdset_exit is called. This is used by the mirror
* code to mark the set as HOLD so that the set won't be
* unloaded while hotspares are being allocated in check_4_hotspares.
* The original fix to the mirror code to hold the set was to call
* md_haltsnarf_enter, but this will block all ioctls and ioctls
* must work for a MN diskset while hotspares are allocated.
*/
void
md_holdset_enter(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_HOLD)
cv_wait(&md_cv, &md_mx);
md_set[setno].s_status |= MD_SET_HOLD;
mutex_exit(&md_mx);
}
void
md_holdset_exit(set_t setno)
{
mutex_enter(&md_mx);
md_set[setno].s_status &= ~MD_SET_HOLD;
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
}
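/*
* Hedged usage sketch: a daemon that must keep the set state stable (for
* example while allocating hotspares, as described above) brackets its
* work with the hold:
*
*	md_holdset_enter(setno);
*	... allocate hotspares; the set cannot be unloaded here ...
*	md_holdset_exit(setno);
*
* Unlike md_haltsnarf_enter(), this does not block ioctls, which must keep
* working in a MN diskset while the hold is in effect.
*/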
/*
* Returns a 0 if this thread marked the set as HOLD (success),
* returns a -1 if set was already marked HOLD (failure).
* Used by the release_set code to see if set is marked HOLD.
* HOLD is set by a daemon when hotspares are being allocated
* to mirror units.
*/
int
md_holdset_testandenter(set_t setno)
{
mutex_enter(&md_mx);
if (md_set[setno].s_status & MD_SET_HOLD) {
mutex_exit(&md_mx);
return (-1);
}
md_set[setno].s_status |= MD_SET_HOLD;
mutex_exit(&md_mx);
return (0);
}
void
md_haltsnarf_enter(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_SNARFING)
cv_wait(&md_cv, &md_mx);
md_set[setno].s_status |= MD_SET_SNARFING;
mutex_exit(&md_mx);
}
void
md_haltsnarf_exit(set_t setno)
{
mutex_enter(&md_mx);
md_set[setno].s_status &= ~MD_SET_SNARFING;
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
}
void
md_haltsnarf_wait(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_SNARFING)
cv_wait(&md_cv, &md_mx);
mutex_exit(&md_mx);
}
/*
* ASSUMED that the md_unit_array_rw WRITER lock is held.
*/
int
md_halt_set(set_t setno, enum md_haltcmd cmd)
{
int i, err;
if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
return (0);
}
if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
for (--i; i > 0; --i) {
if (md_ops[i] == NULL)
continue;
(void) (*(md_ops[i]->md_halt))
(MD_HALT_OPEN, setno);
}
return (EBUSY);
}
}
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
(void) (*(md_ops[i]->md_halt))
(MD_HALT_OPEN, setno);
}
return (EBUSY);
}
}
}
if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed for %s, error %d",
md_ops[i]->md_driver.md_drivername, err);
}
/*
* Unload the devid namespace if it is loaded
*/
md_unload_namespace(setno, NM_DEVID);
md_unload_namespace(setno, 0L);
md_clr_setstatus(setno, MD_SET_SNARFED);
}
return (0);
}
int
md_halt(int global_locks_owned_mask)
{
set_t i, j;
int err;
int init_queues;
md_requestq_entry_t *rqp;
md_ops_t **pops, *ops, *lops;
ddi_modhandle_t mod;
char *name;
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
/*
* Grab all of the global locks that are not already owned to
* ensure that there isn't another thread trying to access a
* global resource while the halt is in progress
*/
if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
return (EINTR);
for (i = 0; i < md_nsets; i++)
md_haltsnarf_enter(i);
/*
* Kill the daemon threads.
*/
init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
md_clr_status(MD_GBL_DAEMONS_LIVE);
md_set_status(MD_GBL_DAEMONS_DIE);
rqp = &md_daemon_queues[0];
i = 0;
while (!NULL_REQUESTQ_ENTRY(rqp)) {
cv_broadcast(&rqp->dispq_headp->a_cv);
rqp = &md_daemon_queues[++i];
}
mutex_enter(&md_mx);
while (md_num_daemons != 0) {
mutex_exit(&md_mx);
delay(md_hz);
mutex_enter(&md_mx);
}
mutex_exit(&md_mx);
md_clr_status(MD_GBL_DAEMONS_DIE);
for (i = 0; i < md_nsets; i++)
/*
* Only call into md_halt_set if s_un / s_ui are both set.
* If they are NULL this set hasn't been accessed, so it's
* pointless performing the call.
*/
if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
if (md_halt_set(i, MD_HALT_CHECK)) {
if (md_start_daemons(init_queues))
cmn_err(CE_WARN,
"md: restart of daemon threads "
"failed");
for (j = 0; j < md_nsets; j++)
md_haltsnarf_exit(j);
return (md_global_lock_exit(
global_locks_owned_mask, EBUSY,
MD_ARRAY_WRITER, NULL));
}
}
/*
* if we get here we are going to do it
*/
for (i = 0; i < md_nsets; i++) {
/*
* Only call into md_halt_set if s_un / s_ui are both set.
* If they are NULL this set hasn't been accessed, so it's
* pointless performing the call.
*/
if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
err = md_halt_set(i, MD_HALT_DOIT);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed set %u, error %d",
(unsigned)i, err);
}
}
/*
* Issue a halt unload to each module to indicate that it
* is about to be unloaded. Each module is called once; the set
* argument has no meaning at this point in time.
*/
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed for %s, error %d",
md_ops[i]->md_driver.md_drivername, err);
}
/* ddi_modclose the submodules */
for (i = 0; i < MD_NOPS; i++) {
/* skip if not open */
if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
continue;
/* find and unlink from md_opslist */
ops = md_ops[i];
mod = md_mods[i];
pops = &md_opslist;
for (lops = *pops; lops;
pops = &lops->md_next, lops = *pops) {
if (lops == ops) {
*pops = ops->md_next;
ops->md_next = NULL;
break;
}
}
/* uninitialize */
name = ops->md_driver.md_drivername;
md_ops[i] = NULL;
md_mods[i] = NULL;
ops->md_selfindex = 0;
ops->md_driver.md_drivername[0] = '\0';
rw_destroy(&ops->md_link_rw.lock);
/* close */
err = ddi_modclose(mod);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt close failed for %s, error %d",
name ? name : "UNKNOWN", err);
}
/* Unload the database */
mddb_unload();
md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */
for (i = 0; i < md_nsets; i++)
md_haltsnarf_exit(i);
return (md_global_lock_exit(global_locks_owned_mask, 0,
MD_ARRAY_WRITER, NULL));
}
/*
* md_layered_open() is an internal routine only for SVM modules, so the
* input device is an md_dev64_t; all SVM modules work internally with that
* device type. The ddi routines, on the other hand, work with dev_t, so if
* we call any ddi routines from here we first have to convert the device
* into a dev_t.
*/
int
md_layered_open(
minor_t mnum,
md_dev64_t *dev,
int md_oflags
)
{
int flag = (FREAD | FWRITE);
cred_t *cred_p = kcred;
major_t major;
int err;
dev_t ddi_dev = md_dev64_to_dev(*dev);
if (ddi_dev == NODEV)
return (ENODEV);
major = getmajor(ddi_dev);
/* metadevice */
if (major == md_major) {
mdi_unit_t *ui;
/* open underlying driver */
mnum = getminor(ddi_dev);
ui = MDI_UNIT(mnum);
if (md_ops[ui->ui_opsindex]->md_open != NULL) {
int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
flag, OTYP_LYR, cred_p, md_oflags);
/*
* As open() may change the device,
* send this info back to the caller.
*/
*dev = md_expldev(ddi_dev);
return (ret);
}
/* or do it ourselves */
(void) md_unit_openclose_enter(ui);
err = md_unit_incopen(mnum, flag, OTYP_LYR);
md_unit_openclose_exit(ui);
/* convert our ddi_dev back to the dev we were given */
*dev = md_expldev(ddi_dev);
return (err);
}
/*
* Open regular device, since open() may change dev_t give new dev_t
* back to the caller.
*/
err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
*dev = md_expldev(ddi_dev);
return (err);
}
/*
* md_layered_close() is an internal routine only for SVM modules, so the
* input device is an md_dev64_t; all SVM modules work internally with that
* device type. The ddi routines, on the other hand, work with dev_t, so if
* we call any ddi routines from here we first have to convert the device
* into a dev_t.
*/
void
md_layered_close(
md_dev64_t dev,
int md_cflags
)
{
int flag = (FREAD | FWRITE);
cred_t *cred_p = kcred;
dev_t ddi_dev = md_dev64_to_dev(dev);
major_t major = getmajor(ddi_dev);
minor_t mnum = getminor(ddi_dev);
/* metadevice */
if (major == md_major) {
mdi_unit_t *ui = MDI_UNIT(mnum);
/* close underlying driver */
if (md_ops[ui->ui_opsindex]->md_close != NULL) {
(*md_ops[ui->ui_opsindex]->md_close)
(ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
return;
}
/* or do it ourselves */
(void) md_unit_openclose_enter(ui);
(void) md_unit_decopen(mnum, OTYP_LYR);
md_unit_openclose_exit(ui);
return;
}
/* close regular device */
(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
}
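/*
* Hedged sketch of the layered open/close pairing (caller context and the
* MD_OFLG_NULL flag usage are illustrative): because md_layered_open() may
* change the device, the possibly-updated md_dev64_t must be used both for
* I/O and for the matching close:
*
*	md_dev64_t tmpdev = un->un_dev;
*	if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL) == 0) {
*		... issue I/O against tmpdev ...
*		md_layered_close(tmpdev, MD_OFLG_NULL);
*	}
*/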
/*
* saves a little code in mdstrategy
*/
int
errdone(mdi_unit_t *ui, struct buf *bp, int err)
{
if ((bp->b_error = err) != 0)
bp->b_flags |= B_ERROR;
else
bp->b_resid = bp->b_bcount;
md_unit_readerexit(ui);
md_biodone(bp);
return (1);
}
static int md_write_label = 0;
int
md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
{
diskaddr_t endblk;
set_t setno = MD_UN2SET(un);
if ((md_get_setstatus(setno) & MD_SET_STALE) &&
(! (bp->b_flags & B_READ)))
return (errdone(ui, bp, EROFS));
/*
* Check early for an unreasonable block number.
*
* b_blkno is defined as a daddr_t, which is typedef'd to a long.
* A problem occurs if b_blkno has bit 31 set and un_total_blocks
* doesn't: b_blkno is then compared as a negative number, which is
* always less than a positive one.
*/
if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
return (errdone(ui, bp, EINVAL));
if (bp->b_lblkno == un->c.un_total_blocks)
return (errdone(ui, bp, 0));
/*
* make sure we don't clobber any labels
*/
if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
(un->c.un_flag & MD_LABELED) && (! md_write_label)) {
cmn_err(CE_NOTE, "md: %s: write to label",
md_shortname(getminor(bp->b_edev)));
return (errdone(ui, bp, EINVAL));
}
bp->b_resid = 0;
endblk = (diskaddr_t)(bp->b_lblkno +
howmany(bp->b_bcount, DEV_BSIZE) - 1);
if (endblk > (un->c.un_total_blocks - 1)) {
bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
endblk = un->c.un_total_blocks - 1;
bp->b_bcount -= bp->b_resid;
}
return (0);
}
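/*
* Worked example of the trimming above (numbers hypothetical): with
* un_total_blocks = 100, b_lblkno = 98 and b_bcount = 3 * DEV_BSIZE,
* endblk computes to 100 and is trimmed back to 99; b_resid becomes
* dbtob(1) (one block's worth of bytes) and b_bcount is reduced so the
* transfer stops at the last block of the metadevice.
*/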
/*
* init_requestq: initializes a request queue and creates its threads.
* return value	= 0 : invalid num_threads
*		= n : n is the number of threads created.
*/
int
init_requestq(
md_requestq_entry_t *rq, /* request queue info */
void (*threadfn)(), /* function to start the thread */
caddr_t threadfn_args, /* args to the function */
int pri, /* thread priority */
int init_queue) /* flag to init queues */
{
struct mdq_anchor *rqhead;
int i;
int num_threads;
num_threads = *(rq->num_threadsp);
rqhead = rq->dispq_headp;
if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
return (0);
if (init_queue) {
rqhead->dq.maxq_len = 0;
rqhead->dq.treqs = 0;
rqhead->dq.dq_next = &rqhead->dq;
rqhead->dq.dq_prev = &rqhead->dq;
cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
}
for (i = 0; i < num_threads; i++) {
(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
TS_RUN, pri);
}
return (i);
}
static void
start_daemon(struct mdq_anchor *q)
{
md_daemon(0, q);
ASSERT(0);
}
/*
* Creates all the md daemons.
* Global:
* md_num_daemons is set to number of daemons.
* MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
*
* Return value: 0 success
* 1 failure
*/
int
md_start_daemons(int init_queue)
{
md_requestq_entry_t *rqp;
int cnt;
int i;
int retval = 0;
if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
return (retval);
}
md_clr_status(MD_GBL_DAEMONS_DIE);
rqp = &md_daemon_queues[0];
i = 0;
while (!NULL_REQUESTQ_ENTRY(rqp)) {
cnt = init_requestq(rqp, start_daemon,
(caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
if (cnt && cnt != *rqp->num_threadsp) {
retval = 1;
break;
}
/*
* initialize variables
*/
md_num_daemons += cnt;
rqp = &md_daemon_queues[++i];
}
md_set_status(MD_GBL_DAEMONS_LIVE);
return (retval);
}
int
md_loadsubmod(set_t setno, char *name, int drvrid)
{
ddi_modhandle_t mod;
md_ops_t **pops, *ops;
int i, err;
/*
* See if the submodule is already ddi_modopen'ed. If not, i is the
* index of the next empty slot.
*/
for (i = 0; md_ops[i] != NULL; i++) {
if (strncmp(name, md_ops[i]->md_driver.md_drivername,
MD_DRIVERNAMELEN) == 0)
return (i);
if (i == (MD_NOPS - 1))
return (-1);
}
if (drvrid < 0) {
/* Do not try to add any records to the DB when stale. */
if (md_get_setstatus(setno) & MD_SET_STALE)
return (-1);
drvrid = md_setshared_name(setno, name, 0L);
}
if (drvrid < 0)
return (-1);
/* open and import the md_ops of the submodules */
mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
if (mod == NULL) {
cmn_err(CE_WARN, "md_loadsubmod: "
"unable to ddi_modopen %s, error %d\n", name, err);
return (-1);
}
pops = ddi_modsym(mod, "md_interface_ops", &err);
if (pops == NULL) {
cmn_err(CE_WARN, "md_loadsubmod: "
"unable to import md_interface_ops from %s, error %d\n",
name, err);
(void) ddi_modclose(mod);
return (-1);
}
/* ddi_modsym returns pointer to md_interface_ops in submod */
ops = *pops;
/* initialize */
ops->md_selfindex = i;
rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
(void) strncpy(ops->md_driver.md_drivername, name,
MD_DRIVERNAMELEN);
/* plumb */
md_ops[i] = ops;
md_mods[i] = mod;
ops->md_next = md_opslist;
md_opslist = ops;
/* return index */
return (i);
}
int
md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
{
int i;
int modindex;
char *name = driver->md_drivername;
set_t setno = driver->md_setno;
int drvid;
int local_dont_load;
if (setno >= md_nsets)
return (-1);
for (i = 0; name[i] != 0; i++)
if (i == (MD_DRIVERNAMELEN -1))
return (-1);
/*
* If set is STALE, set local_dont_load to 1 since no records
* should be added to DB when stale.
*/
if (md_get_setstatus(setno) & MD_SET_STALE) {
local_dont_load = 1;
} else {
local_dont_load = dont_load;
}
/*
* Single thread ioctl module binding with respect to
* similar code executed in md_loadsubmod that is called
* from md_snarf_db_set (which is where that path does
* its md_haltsnarf_enter call).
*/
md_haltsnarf_enter(setno);
/* See if the submodule is already ddi_modopened. */
for (i = 0; md_ops[i] != NULL; i++) {
if (strncmp(name, md_ops[i]->md_driver.md_drivername,
MD_DRIVERNAMELEN) == 0) {
if (! local_dont_load &&
(md_getshared_key(setno, name) == MD_KEYBAD)) {
if (md_setshared_name(setno, name, 0L)
== MD_KEYBAD) {
if (!db_notrequired)
goto err;
}
}
md_haltsnarf_exit(setno);
return (i);
}
if (i == (MD_NOPS -1))
break;
}
if (local_dont_load)
goto err;
drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
/* ddi_modopen the submodule */
modindex = md_loadsubmod(setno, name, drvid);
if (modindex < 0)
goto err;
if (md_ops[modindex]->md_snarf != NULL)
(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
md_haltsnarf_exit(setno);
return (modindex);
err: md_haltsnarf_exit(setno);
return (-1);
}
void
md_call_strategy(buf_t *bp, int flags, void *private)
{
mdi_unit_t *ui;
if (mdv_strategy_tstpnt)
if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
return;
if (getmajor(bp->b_edev) != md_major) {
(void) bdev_strategy(bp);
return;
}
flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
ui = MDI_UNIT(getminor(bp->b_edev));
ASSERT(ui != NULL);
(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
}
/*
* md_call_ioctl:
* -------------
* Issue the specified ioctl to the device associated with the given md_dev64_t
*
* Arguments:
* dev - underlying device [md_dev64_t]
* cmd - ioctl to perform
* data - arguments / result location
* mode - read/write/layered ioctl
* lockp - lock reference
*
* Returns:
* 0 success
* !=0 Failure (error code)
*/
int
md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
{
dev_t device = md_dev64_to_dev(dev);
int rval;
mdi_unit_t *ui;
/*
* See if device is a metadevice. If not call cdev_ioctl(), otherwise
* call the ioctl entry-point in the metadevice.
*/
if (md_getmajor(dev) != md_major) {
int rv;
rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
ddi_get_cred(), &rv);
} else {
ui = MDI_UNIT(md_getminor(dev));
ASSERT(ui != NULL);
rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
mode, lockp);
}
return (rval);
}
void
md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
{
md_link_t *next;
md_link_t **pprev;
rw_enter(rw, RW_WRITER);
next = *head;
pprev = head;
while (next) {
if ((next->ln_setno == setno) && (next->ln_id == id)) {
*pprev = next->ln_next;
rw_exit(rw);
return;
}
pprev = &next->ln_next;
next = next->ln_next;
}
rw_exit(rw);
}
int
md_dev_exists(md_dev64_t dev)
{
if (dev == NODEV64)
return (0);
if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
return (1);
if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
(MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
return (0);
if (MDI_UNIT(md_getminor(dev)) != NULL)
return (1);
return (0);
}
md_parent_t
md_get_parent(md_dev64_t dev)
{
md_unit_t *un;
mdi_unit_t *ui;
md_parent_t parent;
if (md_getmajor(dev) != md_major)
return (MD_NO_PARENT);
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
parent = un->c.un_parent;
md_unit_readerexit(ui);
return (parent);
}
void
md_set_parent(md_dev64_t dev, md_parent_t parent)
{
md_unit_t *un;
mdi_unit_t *ui;
if (md_getmajor(dev) != md_major)
return;
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
un->c.un_parent = parent;
md_unit_readerexit(ui);
}
void
md_reset_parent(md_dev64_t dev)
{
md_unit_t *un;
mdi_unit_t *ui;
if (md_getmajor(dev) != md_major)
return;
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
un->c.un_parent = MD_NO_PARENT;
md_unit_readerexit(ui);
}
static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
int
md_hot_spare_ifc(
hs_cmds_t cmd,
mddb_recid_t id,
u_longlong_t size,
int labeled,
mddb_recid_t *hs_id,
mdkey_t *key,
md_dev64_t *dev,
diskaddr_t *sblock)
{
int err;
/*
* RW lock on hot_spare_interface. We don't want it to change from
* underneath us. If hot_spare_interface is NULL we're going to
* need to set it. So we need to upgrade to a WRITER lock. If that
* doesn't work, we drop the lock and reenter as WRITER. This leaves
* a small hole during which hot_spare_interface could be modified
* so we check it for NULL again. What a pain. Then if still null
* load from md_get_named_service.
*/
rw_enter(&hsp_rwlp.lock, RW_READER);
if (hot_spare_interface == NULL) {
if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
rw_exit(&hsp_rwlp.lock);
rw_enter(&hsp_rwlp.lock, RW_WRITER);
if (hot_spare_interface != NULL) {
err = ((*hot_spare_interface)
(cmd, id, size, labeled, hs_id, key, dev,
sblock));
rw_exit(&hsp_rwlp.lock);
return (err);
}
}
hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
"hot spare interface", 0);
rw_downgrade(&hsp_rwlp.lock);
}
if (hot_spare_interface == NULL) {
cmn_err(CE_WARN, "md: no hotspare interface");
rw_exit(&hsp_rwlp.lock);
return (0);
}
err = ((*hot_spare_interface)
(cmd, id, size, labeled, hs_id, key, dev, sblock));
rw_exit(&hsp_rwlp.lock);
return (err);
}
void
md_clear_hot_spare_interface()
{
rw_enter(&hsp_rwlp.lock, RW_WRITER);
hot_spare_interface = NULL;
rw_exit(&hsp_rwlp.lock);
}
static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
int
md_notify_interface(
md_event_cmds_t cmd,
md_tags_t tag,
set_t set,
md_dev64_t dev,
md_event_type_t event
)
{
int err;
if (md_event_queue == NULL)
return (0);
rw_enter(&ni_rwlp.lock, RW_READER);
if (notify_interface == NULL) {
if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
rw_exit(&ni_rwlp.lock);
rw_enter(&ni_rwlp.lock, RW_WRITER);
if (notify_interface != NULL) {
err = ((*notify_interface)
(cmd, tag, set, dev, event));
rw_exit(&ni_rwlp.lock);
return (err);
}
}
notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
"notify interface", 0);
rw_downgrade(&ni_rwlp.lock);
}
if (notify_interface == NULL) {
cmn_err(CE_WARN, "md: no notify interface");
rw_exit(&ni_rwlp.lock);
return (0);
}
err = ((*notify_interface)(cmd, tag, set, dev, event));
rw_exit(&ni_rwlp.lock);
return (err);
}
char *
obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
{
char *setname;
char name[MD_MAX_CTDLEN];
minor_t mnum = md_getminor(dev);
major_t maj = md_getmajor(dev);
int rtn = 0;
/*
* Verify that the passed dev_t refers to a valid metadevice.
* If it doesn't we can make no assumptions as to what the device
* name is. Return NULL in these cases.
*/
if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
(MD_MIN2SET(mnum) >= md_nsets)) {
return (NULL);
}
setname = NULL;
name[0] = '\0';
switch (tag) {
case SVM_TAG_HSP:
if (setno == 0) {
rtn = snprintf(name, sizeof (name), "hsp%u",
(unsigned)MD_MIN2UNIT(mnum));
} else {
setname = mddb_getsetname(setno);
if (setname != NULL) {
rtn = snprintf(name, sizeof (name), "%s/hsp%u",
setname, (unsigned)MD_MIN2UNIT(mnum));
}
}
break;
case SVM_TAG_DRIVE:
(void) sprintf(name, "drive");
break;
case SVM_TAG_HOST:
(void) sprintf(name, "host");
break;
case SVM_TAG_SET:
rtn = snprintf(name, sizeof (name), "%s",
mddb_getsetname(setno));
if ((name[0] == '\0') || (rtn >= sizeof (name))) {
(void) sprintf(name, "diskset");
rtn = 0;
}
break;
default:
rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
break;
}
/* Check if we got any rubbish from any of the snprintf calls */
if ((name[0] == '\0') || (rtn >= sizeof (name))) {
return (NULL);
}
return (md_strdup(name));
}
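/*
 * Illustrative sketch, not part of the original driver: the returned
 * string is md_strdup()ed, so a caller owns it and must release it with
 * freestr(). SVM_TAG_METADEVICE is assumed from sys/sysevent/svm.h.
 *
 *	char *nm = obj2devname(SVM_TAG_METADEVICE, setno, dev);
 *	if (nm != NULL) {
 *		cmn_err(CE_NOTE, "md: device %s", nm);
 *		freestr(nm);
 *	}
 */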
/* Sysevent subclass and mdnotify event type pairs */
struct node {
char *se_ev;
md_event_type_t md_ev;
};
/*
* Table must be sorted in case-sensitive ascending order of
* the sysevent subclass values
*/
static struct node ev_table[] = {
{ ESC_SVM_ADD, EQ_ADD },
{ ESC_SVM_ATTACH, EQ_ATTACH },
{ ESC_SVM_ATTACHING, EQ_ATTACHING },
{ ESC_SVM_CHANGE, EQ_CHANGE },
{ ESC_SVM_CREATE, EQ_CREATE },
{ ESC_SVM_DELETE, EQ_DELETE },
{ ESC_SVM_DETACH, EQ_DETACH },
{ ESC_SVM_DETACHING, EQ_DETACHING },
{ ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD },
{ ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE },
{ ESC_SVM_ENABLE, EQ_ENABLE },
{ ESC_SVM_ERRED, EQ_ERRED },
{ ESC_SVM_EXCHANGE, EQ_EXCHANGE },
{ ESC_SVM_GROW, EQ_GROW },
{ ESC_SVM_HS_CHANGED, EQ_HS_CHANGED },
{ ESC_SVM_HS_FREED, EQ_HS_FREED },
{ ESC_SVM_HOST_ADD, EQ_HOST_ADD },
{ ESC_SVM_HOST_DELETE, EQ_HOST_DELETE },
{ ESC_SVM_HOTSPARED, EQ_HOTSPARED },
{ ESC_SVM_INIT_FAILED, EQ_INIT_FAILED },
{ ESC_SVM_INIT_FATAL, EQ_INIT_FATAL },
{ ESC_SVM_INIT_START, EQ_INIT_START },
{ ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS },
{ ESC_SVM_IOERR, EQ_IOERR },
{ ESC_SVM_LASTERRED, EQ_LASTERRED },
{ ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD },
{ ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE },
{ ESC_SVM_OFFLINE, EQ_OFFLINE },
{ ESC_SVM_OK, EQ_OK },
{ ESC_SVM_ONLINE, EQ_ONLINE },
{ ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL },
{ ESC_SVM_REGEN_DONE, EQ_REGEN_DONE },
{ ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED },
{ ESC_SVM_REGEN_START, EQ_REGEN_START },
{ ESC_SVM_RELEASE, EQ_RELEASE },
{ ESC_SVM_REMOVE, EQ_REMOVE },
{ ESC_SVM_RENAME_DST, EQ_RENAME_DST },
{ ESC_SVM_RENAME_SRC, EQ_RENAME_SRC },
{ ESC_SVM_REPLACE, EQ_REPLACE },
{ ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE },
{ ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED },
{ ESC_SVM_RESYNC_START, EQ_RESYNC_START },
{ ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS },
{ ESC_SVM_TAKEOVER, EQ_TAKEOVER }
};
static md_tags_t md_tags[] = {
TAG_UNK,
TAG_METADEVICE,
TAG_UNK,
TAG_UNK,
TAG_UNK,
TAG_UNK,
TAG_REPLICA,
TAG_HSP,
TAG_HS,
TAG_SET,
TAG_DRIVE,
TAG_HOST,
TAG_MEDIATOR
};
md_event_type_t
ev_get(char *subclass)
{
int high, mid, low, p;
low = 0;
high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
while (low <= high) {
mid = (high + low) / 2;
p = strcmp(subclass, ev_table[mid].se_ev);
if (p == 0) {
return (ev_table[mid].md_ev);
} else if (p < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return (EQ_EMPTY);
}
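/*
 * Illustrative sketch, not part of the original driver: ev_get() is a
 * binary search over ev_table, which is why the table above must stay
 * sorted by subclass string. For example:
 *
 *	ev_get(ESC_SVM_CREATE);		(returns EQ_CREATE)
 *	ev_get("no.such.subclass");	(returns EQ_EMPTY)
 */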
/*
* Log mdnotify event
*/
void
do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
{
md_event_type_t ev_type;
md_tags_t md_tag;
/* Translate sysevent into mdnotify event */
ev_type = ev_get(se_subclass);
if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
md_tag = TAG_UNK;
} else {
md_tag = md_tags[tag];
}
NOTIFY_MD(md_tag, setno, devid, ev_type);
}
/*
* Log SVM sys events
*/
void
svm_gen_sysevent(
char *se_class,
char *se_subclass,
uint32_t tag,
set_t setno,
md_dev64_t devid
)
{
nvlist_t *attr_list;
sysevent_id_t eid;
int err = DDI_SUCCESS;
char *devname;
extern dev_info_t *md_devinfo;
/* Raise the mdnotify event before anything else */
do_mdnotify(se_subclass, tag, setno, devid);
if (md_devinfo == NULL) {
return;
}
err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
if (err == DDI_SUCCESS) {
/* Add the version number */
err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
(uint32_t)SVM_VERSION);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the tag attribute */
err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the set number attribute */
err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the device id attribute */
err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the device name attribute */
devname = obj2devname(tag, setno, devid);
if (devname != NULL) {
err = nvlist_add_string(attr_list, SVM_DEV_NAME,
devname);
freestr(devname);
} else {
err = nvlist_add_string(attr_list, SVM_DEV_NAME,
"unspecified");
}
if (err != DDI_SUCCESS) {
goto fail;
}
/* Attempt to post event */
err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
se_subclass, attr_list, &eid, DDI_SLEEP);
nvlist_free(attr_list);
if (err != DDI_SUCCESS) {
cmn_err(CE_WARN, "Failed to log event for %s, %s,"
" err=%x", se_class, se_subclass, err);
}
}
return;
fail:
nvlist_free(attr_list);
cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
se_class, se_subclass, err);
}
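/*
 * Illustrative sketch, not part of the original driver: a typical call
 * raising a config sysevent for a metadevice in the local set, assuming
 * the EC_SVM_CONFIG class and ESC_SVM_CREATE subclass definitions from
 * sys/sysevent/eventdefs.h:
 *
 *	svm_gen_sysevent(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE,
 *	    MD_LOCAL_SET, md_makedevice(md_major, mnum));
 */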
void
md_clear_named_service()
{
rw_enter(&ni_rwlp.lock, RW_WRITER);
notify_interface = NULL;
rw_exit(&ni_rwlp.lock);
}
void
md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
{
mdi_unit_t *ui;
set_t setno = MD_MIN2SET(mnum);
ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
ui->ui_opsindex = ops->md_selfindex;
/* initialize all the incore condition variables */
mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
if (alloc_lock) {
ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
MUTEX_DEFAULT, NULL);
ui->ui_io_lock->io_list_front = NULL;
ui->ui_io_lock->io_list_back = NULL;
}
if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
MDI_VOIDUNIT(mnum) = (void *) ui;
rw_exit(&md_unit_array_rw.lock);
} else
MDI_VOIDUNIT(mnum) = (void *) ui;
rw_enter(&ops->md_link_rw.lock, RW_WRITER);
ui->ui_link.ln_next = ops->md_head;
ui->ui_link.ln_setno = setno;
ui->ui_link.ln_id = mnum;
ops->md_head = &ui->ui_link;
/* set up the unavailable field */
#if defined(_ILP32)
if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
ui->ui_tstate |= MD_64MD_ON_32KERNEL;
cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
"metadevices are not accessible on a 32 bit kernel",
mnum);
}
#endif
rw_exit(&ops->md_link_rw.lock);
}
void
md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
{
mdi_unit_t *ui;
/*
* ASSUMPTION: md_unit_array_rw WRITER lock is held.
*/
ui = MDI_UNIT(mnum);
if (ui == NULL)
return;
md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
&ops->md_head);
/* destroy the io lock if one is being used */
if (ui->ui_io_lock) {
mutex_destroy(&ui->ui_io_lock->io_mx);
cv_destroy(&ui->ui_io_lock->io_cv);
kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
}
/* teardown kstat */
md_kstat_destroy(mnum);
/* destroy all the incore condition variables */
mutex_destroy(&ui->ui_mx);
cv_destroy(&ui->ui_cv);
kmem_free(ui, sizeof (mdi_unit_t));
MDI_VOIDUNIT(mnum) = (void *) NULL;
}
void
md_rem_names(sv_dev_t *sv, int nsv)
{
int i, s;
int max_sides;
if (nsv == 0)
return;
/* All entries removed are in the same diskset */
if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (i = 0; i < nsv; i++)
for (s = 0; s < max_sides; s++)
(void) md_remdevname(sv[i].setno, s, sv[i].key);
}
/*
* Checking user args before we get into physio - returns 0 for ok, else errno
* We do a lot of checking against illegal arguments here because some of the
* real disk drivers don't like certain kinds of arguments. (e.g. xy doesn't
* like an odd-address user buffer.) Those drivers capture bad arguments in
* xxread and xxwrite. But since the meta-driver calls their strategy routines
* directly, two bad scenarios might happen:
* 1. the real strategy doesn't like it and panics.
* 2. the real strategy doesn't like it and sets B_ERROR.
*
* The second case is no better than the first one, since the meta-driver
* will treat it as a media error and offline the mirror metapartition.
* (Too bad there is no way to tell what error it is.)
*
*/
int
md_chk_uio(struct uio *uio)
{
int i;
struct iovec *iov;
/*
* Check for negative or not block-aligned offset
*/
if ((uio->uio_loffset < 0) ||
((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
return (EINVAL);
}
iov = uio->uio_iov;
i = uio->uio_iovcnt;
while (i--) {
if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
return (EINVAL);
/*
* Bug # 1212146
* The default is to not check alignment, but we can now check
* for a larger number of alignments if desired.
*/
if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
return (EINVAL);
iov++;
}
return (0);
}
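/*
 * Illustrative sketch, not part of the original driver: a request whose
 * offset or length is not a multiple of DEV_BSIZE (512) is rejected
 * here instead of reaching the underlying strategy routine:
 *
 *	uio->uio_loffset = 513;		(not block aligned)
 *	error = md_chk_uio(uio);	(returns EINVAL)
 */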
char *
md_shortname(
minor_t mnum
)
{
static char buf[MAXPATHLEN];
char *devname;
char *invalid = " (Invalid minor number %u) ";
char *metaname;
mdc_unit_t *un;
side_t side;
set_t setno = MD_MIN2SET(mnum);
unit_t unit = MD_MIN2UNIT(mnum);
if ((un = MD_UNIT(mnum)) == NULL) {
(void) snprintf(buf, sizeof (buf), invalid, mnum);
return (buf);
}
/*
* If unit is not a friendly name unit, derive the name from the
* minor number.
*/
if ((un->un_revision & MD_FN_META_DEV) == 0) {
/* This is a traditional metadevice */
if (setno == MD_LOCAL_SET) {
(void) snprintf(buf, sizeof (buf), "d%u",
(unsigned)unit);
} else {
(void) snprintf(buf, sizeof (buf), "%s/d%u",
mddb_getsetname(setno), (unsigned)unit);
}
return (buf);
}
/*
* It is a friendly name metadevice, so we need to get its name.
*/
side = mddb_getsidenum(setno);
devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
if (md_getdevname(setno, side, MD_KEYWILD,
md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
/*
* md_getdevname has given us either /dev/md/dsk/<metaname>
* or /dev/md/<setname>/dsk/<metaname> depending on whether
* or not we are in the local set. Thus, we'll pull the
* metaname from this string.
*/
if ((metaname = strrchr(devname, '/')) == NULL) {
(void) snprintf(buf, sizeof (buf), invalid, mnum);
goto out;
}
metaname++; /* move past slash */
if (setno == MD_LOCAL_SET) {
/* No set name. */
(void) snprintf(buf, sizeof (buf), "%s", metaname);
} else {
/* Include setname */
(void) snprintf(buf, sizeof (buf), "%s/%s",
mddb_getsetname(setno), metaname);
}
} else {
/* We couldn't find the name. */
(void) snprintf(buf, sizeof (buf), invalid, mnum);
}
out:
kmem_free(devname, MAXPATHLEN);
return (buf);
}
char *
md_devname(
set_t setno,
md_dev64_t dev,
char *buf,
size_t size
)
{
static char mybuf[MD_MAX_CTDLEN];
int err;
if (buf == NULL) {
buf = mybuf;
size = sizeof (mybuf);
} else {
ASSERT(size >= MD_MAX_CTDLEN);
}
err = md_getdevname_common(setno, mddb_getsidenum(setno),
0, dev, buf, size, MD_NOWAIT_LOCK);
if (err) {
if (err == ENOENT) {
(void) sprintf(buf, "(Unavailable)");
} else {
(void) sprintf(buf, "(%u.%u)",
md_getmajor(dev), md_getminor(dev));
}
}
return (buf);
}
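/*
 * Illustrative sketch, not part of the original driver: callers either
 * hand in a buffer of at least MD_MAX_CTDLEN bytes or pass NULL to
 * borrow the shared static buffer:
 *
 *	char buf[MD_MAX_CTDLEN];
 *	(void) md_devname(setno, dev, buf, sizeof (buf));
 */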
void
md_minphys(buf_t *pb)
{
extern unsigned md_maxbcount;
if (pb->b_bcount > md_maxbcount)
pb->b_bcount = md_maxbcount;
}
void
md_bioinit(struct buf *bp)
{
ASSERT(bp);
bioinit(bp);
bp->b_back = bp;
bp->b_forw = bp;
bp->b_flags = B_BUSY; /* initialize flags */
}
void
md_bioreset(struct buf *bp)
{
ASSERT(bp);
bioreset(bp);
bp->b_back = bp;
bp->b_forw = bp;
bp->b_flags = B_BUSY; /* initialize flags */
}
/*
* md_bioclone is needed as long as the real bioclone only takes a daddr_t
* as block number.
* We simply call bioclone with all input parameters but blkno, and set the
* correct blkno afterwards.
* Caveat Emptor: bp_mem must not be NULL!
*/
buf_t *
md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
{
(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
bp_mem->b_lblkno = blkno;
return (bp_mem);
}
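/*
 * Illustrative sketch, not part of the original driver: cloning a parent
 * buf for a child I/O at a diskaddr_t block address; child_done is a
 * hypothetical iodone routine, and bp_mem (here &cb) must not be NULL:
 *
 *	buf_t cb;
 *	md_bioinit(&cb);
 *	(void) md_bioclone(pb, 0, pb->b_bcount, pb->b_edev, blkno,
 *	    child_done, &cb, KM_NOSLEEP);
 */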
/*
* kstat stuff
*/
void
md_kstat_init_ui(
minor_t mnum,
mdi_unit_t *ui
)
{
if ((ui != NULL) && (ui->ui_kstat == NULL)) {
set_t setno = MD_MIN2SET(mnum);
unit_t unit = MD_MIN2UNIT(mnum);
char module[KSTAT_STRLEN];
char *p = module;
if (setno != MD_LOCAL_SET) {
char buf[64];
char *s = buf;
char *e = module + sizeof (module) - 4;
(void) sprintf(buf, "%u", setno);
while ((p < e) && (*s != '\0'))
*p++ = *s++;
*p++ = '/';
}
*p++ = 'm';
*p++ = 'd';
*p = '\0';
if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
ui->ui_kstat->ks_lock = &ui->ui_mx;
kstat_install(ui->ui_kstat);
}
}
}
void
md_kstat_init(
minor_t mnum
)
{
md_kstat_init_ui(mnum, MDI_UNIT(mnum));
}
void
md_kstat_destroy_ui(
mdi_unit_t *ui
)
{
/*
* The kstat_delete() interface has its own locking mechanism and
* does not allow the kstat lock (ks_lock) to be held across the call.
* Note: ks_lock == ui_mx from the md_kstat_init_ui().
*/
if ((ui != NULL) && (ui->ui_kstat != NULL)) {
kstat_delete(ui->ui_kstat);
ui->ui_kstat = NULL;
}
}
void
md_kstat_destroy(
minor_t mnum
)
{
md_kstat_destroy_ui(MDI_UNIT(mnum));
}
/*
* In the following routines, ui_mx is held before checking the
* validity of ui_kstat. This is done to make sure that we don't trip over
* a NULL ui_kstat.
*/
void
md_kstat_waitq_enter(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_waitq_to_runq(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_waitq_exit(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_runq_enter(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_runq_exit(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_done(
mdi_unit_t *ui,
buf_t *bp,
int war
)
{
size_t n_done;
/* check for end of device */
if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
n_done = bp->b_bcount;
} else if (bp->b_bcount < bp->b_resid) {
n_done = 0;
} else {
n_done = bp->b_bcount - bp->b_resid;
}
/* do accounting */
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL) {
if ((! war) && (bp->b_flags & B_READ)) {
KSTAT_IO_PTR(ui->ui_kstat)->reads++;
KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
} else {
KSTAT_IO_PTR(ui->ui_kstat)->writes++;
KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
}
kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
}
mutex_exit(&ui->ui_mx);
}
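/*
 * Illustrative sketch, not part of the original driver: the usual
 * accounting lifecycle of one I/O against a unit's kstat:
 *
 *	md_kstat_waitq_enter(ui);	(request queued)
 *	md_kstat_waitq_to_runq(ui);	(dispatched to the device)
 *	(... I/O completes ...)
 *	md_kstat_done(ui, bp, 0);	(bytes counted, run queue exited)
 */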
pid_t
md_getpid()
{
pid_t valuep;
if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
ASSERT(0);
return ((pid_t)0);
} else {
ASSERT(valuep);
return (valuep);
}
}
proc_t *
md_getproc()
{
proc_t *valuep;
if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
ASSERT(0);
return ((proc_t *)NULL);
} else {
ASSERT(valuep);
return (valuep);
}
}
extern kmutex_t pidlock;
/*
* This checks to see if a process/pid pair is still running. For the
* diskset lock, when both pid and proc are zero the lock is not
* currently held.
*/
int
md_checkpid(pid_t pid, proc_t *proc)
{
int retval = 1;
if (pid == 0 && proc == NULL)
return (0);
mutex_enter(&pidlock);
if (prfind(pid) != proc)
retval = 0;
mutex_exit(&pidlock);
return (retval);
}
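/*
 * Illustrative sketch, not part of the original driver: recording a lock
 * holder with md_getpid()/md_getproc() and later validating it; the
 * lock structure and its fields are hypothetical:
 *
 *	lockp->l_pid = md_getpid();
 *	lockp->l_proc = md_getproc();
 *	(later)
 *	if (!md_checkpid(lockp->l_pid, lockp->l_proc))
 *		(holder has exited; the lock may be broken)
 */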
/*
* NAME: md_init_probereq
*
* DESCRIPTION: initializes a probe request. Parcels out the mnums such that
* they can be dispatched to multiple daemon threads.
*
* PARAMETERS: struct md_probedev_impl *p pointer to the ioctl input
*
* RETURN VALUE: Returns errno
*
*/
int
md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
{
int err = 0;
int modindx;
intptr_t (*probe_test)();
/*
* Initialize the semaphores and mutex
* for the request
*/
p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
probe_test = md_get_named_service(NODEV64, modindx,
p->probe.test_name, 0);
if (probe_test == NULL) {
err = EINVAL;
goto err_out;
}
err = md_create_probe_rqlist(p, hdrpp, probe_test);
err_out:
return (err);
}
/*
* NAME: md_probe_one
*
* DESCRIPTION: Generic routine for probing disks. This is called from the
* daemon.
*
* PARAMETERS: probe_req_t *reqp pointer to the probe request structure.
*
*/
void
md_probe_one(probe_req_t *reqp)
{
mdi_unit_t *ui;
md_probedev_impl_t *p;
int err = 0;
set_t setno;
p = (md_probedev_impl_t *)reqp->private_handle;
/*
* Validate the unit while holding the global ioctl lock, then
* obtain the unit_writerlock. Once the writerlock has been obtained
* we can release the global lock. As long as we hold one of these
* locks this will prevent a metaclear operation from being performed
* on the metadevice because metaclear takes the readerlock (via
* openclose lock).
* To avoid a potential deadlock with the probe_fcn() causing i/o to
* be issued to the writerlock'd metadevice we only grab the writerlock
* if the unit is not an SVM root device.
*/
while (md_ioctl_lock_enter() == EINTR)
;
setno = MD_MIN2SET(reqp->mnum);
ui = MDI_UNIT(reqp->mnum);
if (ui != NULL) {
int writer_grabbed;
dev_t svm_root;
if ((setno == MD_LOCAL_SET) && root_is_svm) {
svm_root = getrootdev();
if (getminor(svm_root) == reqp->mnum) {
writer_grabbed = 0;
} else {
writer_grabbed = 1;
(void) md_unit_writerlock_common(ui, 0);
}
} else {
writer_grabbed = 1;
(void) md_unit_writerlock_common(ui, 0);
}
(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
err = (*reqp->probe_fcn)(ui, reqp->mnum);
if (writer_grabbed) {
md_unit_writerexit(ui);
}
} else {
(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
}
/* update the info in the probe structure */
mutex_enter(PROBE_MX(p));
if (err != 0) {
cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
reqp->mnum);
(void) mdsyserror(&(p->probe.mde), err);
}
mutex_exit(PROBE_MX(p));
sema_v(PROBE_SEMA(p));
kmem_free(reqp, sizeof (probe_req_t));
}
char *
md_strdup(char *cp)
{
char *new_cp = NULL;
new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
return (strcpy(new_cp, cp));
}
void
freestr(char *cp)
{
kmem_free(cp, strlen(cp) + 1);
}
/*
* Validate the list and skip invalid devices. Then create
* a doubly linked circular list of devices to probe.
* The hdr points to the head and tail of this list.
*/
static int
md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
intptr_t (*probe_test)())
{
int i, err, nodevcnt;
probe_req_t *tp;
daemon_queue_t *hp;
minor_t mnum;
nodevcnt = 0;
hp = NULL;
for (i = 0; i < plist->probe.nmdevs; i++) {
mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
if (MDI_UNIT(mnum) == NULL) {
cmn_err(CE_WARN, "md: Cannot probe %s since it does "
"not exist", md_shortname(mnum));
nodevcnt++;
continue;
}
tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
tp->mnum = mnum;
tp->private_handle = (void *)plist;
tp->probe_fcn = probe_test;
if (hp == NULL) {
hp = (daemon_queue_t *)tp;
hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
} else {
tp->dq.dq_next = hp;
tp->dq.dq_prev = hp->dq_prev;
hp->dq_prev->dq_next = (daemon_queue_t *)tp;
hp->dq_prev = (daemon_queue_t *)tp;
}
}
*hdr = hp;
if (nodevcnt > 0)
plist->probe.nmdevs -= nodevcnt;
/*
* If there are no devices to be probed because they were
* incorrect, then return an error.
*/
err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
return (err);
}
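/*
 * Illustrative sketch, not part of the original driver: the initiator
 * builds the request list, hands the entries to daemon threads, and
 * then waits for one sema_v() per device from md_probe_one():
 *
 *	if ((err = md_init_probereq(p, &hdr)) == 0) {
 *		(dispatch each entry on hdr to a daemon thread)
 *		for (i = 0; i < p->probe.nmdevs; i++)
 *			sema_p(PROBE_SEMA(p));
 *	}
 */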
/*
* This routine increments the I/O count for set I/O operations. This
* value is used to determine if an I/O can be done. If a release is in
* progress this will return an error and cause the I/O to be errored.
*/
int
md_inc_iocount(set_t setno)
{
int rc = 0;
if (setno == 0)
return (0);
mutex_enter(&md_set_io[setno].md_io_mx);
if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
rc = EIO;
goto out;
}
ASSERT(md_set_io[setno].io_cnt >= 0);
md_set_io[setno].io_cnt++;
out: mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
void
md_inc_iocount_noblock(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_cnt++;
mutex_exit(&md_set_io[setno].md_io_mx);
}
void
md_dec_iocount(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_cnt--;
ASSERT(md_set_io[setno].io_cnt >= 0);
if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
(md_set_io[setno].io_cnt == 0))
cv_broadcast(&md_set_io[setno].md_io_cv);
mutex_exit(&md_set_io[setno].md_io_mx);
}
int
md_isblock_setio(set_t setno)
{
int rc = 0;
if (setno == 0)
return (0);
mutex_enter(&md_set_io[setno].md_io_mx);
if (md_set_io[setno].io_state & MD_SET_RELEASE)
rc = 1;
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
int
md_block_setio(set_t setno)
{
int rc = 0;
if (setno == 0)
return (1);
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_RELEASE;
while (md_set_io[setno].io_cnt > 0) {
cv_wait(&md_set_io[setno].md_io_cv,
&md_set_io[setno].md_io_mx);
}
rc = 1;
ASSERT(md_set_io[setno].io_cnt == 0);
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
void
md_clearblock_setio(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_ACTIVE;
mutex_exit(&md_set_io[setno].md_io_mx);
}
void
md_unblock_setio(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
#ifdef DEBUG
if (md_set_io[setno].io_cnt != 0) {
cmn_err(CE_NOTE, "set %d count was %ld at take",
setno, md_set_io[setno].io_cnt);
}
#endif /* DEBUG */
md_set_io[setno].io_state = MD_SET_ACTIVE;
md_set_io[setno].io_cnt = 0;
mutex_exit(&md_set_io[setno].md_io_mx);
}
/*
* Test and set version of the md_block_setio.
* Set the io_state to keep new I/O from being issued.
* If there is I/O currently in progress, then set io_state to active
* and return failure. Otherwise, return a 1 for success.
*
* Used in a MN diskset since the commd must be suspended before
* this node can attempt to withdraw from a diskset. But, with commd
* suspended, I/O may have been issued that can never finish until
* commd is resumed (allocation of hotspare, etc). So, if I/O is
* outstanding after diskset io_state is marked RELEASE, then set diskset
* io_state back to ACTIVE and return failure.
*/
int
md_tas_block_setio(set_t setno)
{
int rc;
if (setno == 0)
return (1);
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_RELEASE;
if (md_set_io[setno].io_cnt > 0) {
md_set_io[setno].io_state = MD_SET_ACTIVE;
rc = 0;
} else {
rc = 1;
}
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
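/*
 * Illustrative sketch, not part of the original driver: a withdraw
 * sequence built on the gates above:
 *
 *	if (md_tas_block_setio(setno) == 0)
 *		return (EBUSY);		(I/O still outstanding)
 *	(... perform the release; new I/O now gets EIO ...)
 *	md_clearblock_setio(setno);	(reopen the gate)
 */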
void
md_biodone(struct buf *pb)
{
minor_t mnum;
set_t setno;
mdi_unit_t *ui;
mnum = getminor(pb->b_edev);
setno = MD_MIN2SET(mnum);
if (setno == 0) {
biodone(pb);
return;
}
#ifdef DEBUG
ui = MDI_UNIT(mnum);
if (!md_unit_isopen(ui))
cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
#endif /* DEBUG */
/*
* Handle the local diskset
*/
if (md_set_io[setno].io_cnt > 0)
md_dec_iocount(setno);
#ifdef DEBUG
/*
* This is being done after the lock is dropped so there
* are cases where it may be invalid. It is advisory.
*/
if (md_set_io[setno].io_state & MD_SET_RELEASE) {
/* Only display this error once for this metadevice */
if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
cmn_err(CE_NOTE,
"I/O to %s attempted during set RELEASE\n",
md_shortname(mnum));
ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
}
}
#endif /* DEBUG */
biodone(pb);
}
/*
* Driver special private devt handling routine
* INPUT: md_dev64_t
* OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
*/
dev_t
md_dev64_to_dev(md_dev64_t dev)
{
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
minor_t minor = (minor_t)(dev & MAXMIN64);
return (makedevice(major, minor));
}
/*
* Driver private makedevice routine
* INPUT: major_t major, minor_t minor
* OUTPUT: md_dev64_t, the same layout on a 32 bit or a 64 bit kernel.
*/
md_dev64_t
md_makedevice(major_t major, minor_t minor)
{
return (((md_dev64_t)major << NBITSMINOR64) | minor);
}
/*
* Driver private devt md_getmajor routine
* INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
* OUTPUT: the appropriate major number
*/
major_t
md_getmajor(md_dev64_t dev)
{
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
if (major == 0) {
/* Here we were given a 32bit dev */
major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
}
return (major);
}
/*
* Driver private devt md_getminor routine
* INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
* OUTPUT: the appropriate minor number
*/
minor_t
md_getminor(md_dev64_t dev)
{
minor_t minor;
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
if (major == 0) {
/* Here we were given a 32bit dev */
minor = (minor_t)(dev & MAXMIN32);
} else {
minor = (minor_t)(dev & MAXMIN64);
}
return (minor);
}
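/*
 * Illustrative sketch, not part of the original driver: the helpers
 * above round-trip a major/minor pair through the 64-bit container
 * regardless of the kernel data model:
 *
 *	md_dev64_t d64 = md_makedevice(md_major, mnum);
 *	ASSERT(md_getmajor(d64) == md_major);
 *	ASSERT(md_getminor(d64) == mnum);
 *	dev_t dev = md_dev64_to_dev(d64);	(native dev_t for DDI use)
 */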
int
md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
{
/*
* If the metadevice is an old style device, it has a vtoc,
* in that case all reading EFI ioctls are not applicable.
* If the metadevice has an EFI label, reading vtoc and geom ioctls
* are not supposed to work.
*/
switch (cmd) {
case DKIOCGGEOM:
case DKIOCGAPART:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
case DKIOCGVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
/* if > 1 TB but < 2TB return overflow */
if (c.un_revision & MD_64BIT_META_DEV) {
return (EOVERFLOW);
}
break;
case DKIOCGEXTVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
case DKIOCGETEFI:
case DKIOCPARTITION:
if ((c.un_flag & MD_EFILABEL) == 0) {
return (ENOTSUP);
}
break;
case DKIOCSETEFI:
/* setting an EFI label should always be ok */
return (0);
case DKIOCSVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
/* if > 1 TB but < 2TB return overflow */
if (c.un_revision & MD_64BIT_META_DEV) {
return (EOVERFLOW);
}
break;
case DKIOCSEXTVTOC:
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
}
return (0);
}
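/*
 * Illustrative sketch, not part of the original driver: gating a label
 * ioctl before doing any real work:
 *
 *	if ((err = md_check_ioctl_against_unit(cmd, un->c)) != 0)
 *		return (err);	(ENOTSUP/EOVERFLOW on label mismatch)
 */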
/*
* md_vtoc_to_efi_record()
* Input: record id of the vtoc record
* Output: record id of the efi record
* Function:
* - reads the volume name from the vtoc record
* - converts the volume name to a format, libefi understands
* - creates a new record of size MD_EFI_PARTNAME_BYTES
* - stores the volname in that record,
* - commits that record
* - returns the recid of the efi record.
* Caveat Emptor:
* The calling routine must do something like
* - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
* - commit(un)
* - delete(vtoc_recid)
* in order to keep the mddb consistent in case of a panic in the middle.
* Errors:
* - returns 0 on any error
*/
mddb_recid_t
md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
{
struct vtoc *vtoc;
ushort_t *v;
mddb_recid_t efi_recid;
int i;
if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
return (0);
}
vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
MD_CRO_32BIT, setno);
if (efi_recid < 0) {
return (0);
}
v = (ushort_t *)mddb_getrecaddr(efi_recid);
/* This for loop reads, converts, and writes */
for (i = 0; i < LEN_DKL_VVOL; i++) {
v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
}
/* commit the new record */
mddb_commitrec_wrapper(efi_recid);
return (efi_recid);
}
/*
* Send a kernel message.
* The caller has to provide an allocated result structure.
* If the door handler disappears we retry, emitting warnings every so often.
*
* The recipient argument is almost always unused, and is therefore typically
* set to zero, as zero is an invalid cluster nodeid. The exceptions are the
* marking and clearing of the DRL from a node that is not currently the
* owner. In these cases, the recipient argument will be the nodeid of the
* mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
* nodes will not receive these messages.
*
* For the case where md_mn_is_commd_present() is false, we rely on the
* "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
* kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
*/
int
mdmn_ksend_message(
set_t setno,
md_mn_msgtype_t type,
uint_t flags,
md_mn_nodeid_t recipient,
char *data,
int size,
md_mn_kresult_t *result)
{
door_arg_t da;
md_mn_kmsg_t *kmsg;
uint_t send_try_cnt = 0;
uint_t retry_noise_cnt = 0;
int rval;
k_sigset_t oldmask, newmask;
if (size > MDMN_MAX_KMSG_DATA)
return (ENOMEM);
kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
kmsg->kmsg_flags = flags;
kmsg->kmsg_setno = setno;
kmsg->kmsg_recipient = recipient;
kmsg->kmsg_type = type;
kmsg->kmsg_size = size;
bcopy(data, &(kmsg->kmsg_data), size);
/*
* Wait for the door handle to be established.
*/
while (mdmn_door_did == -1) {
if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
cmn_err(CE_WARN, "door handle not yet ready. "
"Check if /usr/lib/lvm/mddoors is running");
}
delay(md_hz);
}
/*
* If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
* do not fail if the user process receives a signal while we're
* active in the door interface.
*/
if (flags & MD_MSGF_BLK_SIGNAL) {
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
}
/*
* If message failed with an RPC_FAILURE when rpc.mdcommd had
* been gracefully shut down (md_mn_is_commd_present returns FALSE)
* then don't retry the message anymore. If message
* failed due to any other reason, then retry up to MD_MN_WARN_INTVL
* times which should allow a shutting down system time to
* notify the kernel of a graceful shutdown of rpc.mdcommd.
*
* Caller of this routine will need to check the md_mn_commd_present
* flag and the failure error in order to determine whether to panic
* or not. If md_mn_commd_present is set to 0 and failure error
* is RPC_FAILURE, the calling routine should not panic since the
* system is in the process of being shutdown.
*
*/
retry_noise_cnt = send_try_cnt = 0;
while (md_mn_is_commd_present_lite()) {
/*
* data_ptr and data_size are initialized here because on
* return from the upcall, they contain data duplicated from
* rbuf and rsize. This causes subsequent upcalls to fail.
*/
da.data_ptr = (char *)(kmsg);
da.data_size = sizeof (md_mn_kmsg_t);
da.desc_ptr = NULL;
da.desc_num = 0;
da.rbuf = (char *)result;
da.rsize = sizeof (*result);
while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
NULL, SIZE_MAX, 0)) != 0) {
if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
if (rval == EAGAIN) {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Check if mddoors is running.");
} else if (rval == EINTR) {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Check if rpc.mdcommd is running.");
} else {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Returned %d",
rval);
}
}
if (++send_try_cnt >= md_send_retry_limit)
break;
delay(md_hz);
/*
* data_ptr and data_size are re-initialized here
* because on return from the upcall, they contain
* data duplicated from rbuf and rsize. This causes
* subsequent upcalls to fail.
*/
da.data_ptr = (char *)(kmsg);
da.data_size = sizeof (md_mn_kmsg_t);
da.desc_ptr = NULL;
da.desc_num = 0;
da.rbuf = (char *)result;
da.rsize = sizeof (*result);
}
/*
* If:
* - the send succeeded (MDMNE_ACK)
* - we had an MDMNE_RPC_FAIL and commd is now gone
* (note: since the outer loop is commd-dependent,
* checking MDMNE_RPC_FAIL here is meaningless)
* - we were told not to retry
* - we exceeded the RPC failure send limit
* punch out of the outer loop prior to the delay()
*/
if (result->kmmr_comm_state == MDMNE_ACK ||
(flags & MD_MSGF_KSEND_NORETRY) ||
(++send_try_cnt % md_send_retry_limit) == 0 ||
!md_mn_is_commd_present())
break;
delay(md_hz);
}
if (flags & MD_MSGF_BLK_SIGNAL) {
sigreplace(&oldmask, (k_sigset_t *)NULL);
}
kmem_free(kmsg, sizeof (md_mn_kmsg_t));
return (0);
}
/*
* Called to propagate the capability of a metadevice to all nodes in the set.
*
* On entry, lockp is set if the function has been called from within an ioctl.
*
* IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock, is called in this
* routine to enable other mdioctls to enter the kernel while this
* thread of execution waits on the completion of mdmn_ksend_message. When
* the message is completed the thread continues and md_ioctl_lock must be
* reacquired. Even though md_ioctl_lock is interruptible, we choose to
* ignore EINTR as we must not return without acquiring md_ioctl_lock.
*/
int
mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
{
md_mn_msg_setcap_t msg;
md_mn_kresult_t *kres;
mdi_unit_t *ui = MDI_UNIT(mnum);
int ret;
k_sigset_t oldmask, newmask;
(void) strncpy((char *)&msg.msg_setcap_driver,
md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
msg.msg_setcap_mnum = mnum;
msg.msg_setcap_set = vc.vc_set;
if (lockp)
IOLOCK_RETURN_RELEASE(0, lockp);
kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
/*
* Mask signals for the mdmd_ksend_message call. This keeps the door
* interface from failing if the user process receives a signal while
* in mdmn_ksend_message.
*/
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
kres));
sigreplace(&oldmask, (k_sigset_t *)NULL);
if (!MDMN_KSEND_MSG_OK(ret, kres)) {
mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
ret = EIO;
}
kmem_free(kres, sizeof (md_mn_kresult_t));
if (lockp) {
IOLOCK_RETURN_REACQUIRE(lockp);
}
return (ret);
}
/*
* Called to clear all of the transient capabilities for a metadevice when it is
* not open on any node in the cluster.
* Called from close for mirror and sp.
*/
void
mdmn_clear_all_capabilities(minor_t mnum)
{
md_isopen_t clumsg;
int ret;
md_mn_kresult_t *kresult;
volcap_t vc;
k_sigset_t oldmask, newmask;
clumsg.dev = md_makedevice(md_major, mnum);
clumsg.mde = mdnullerror;
/*
* The check open message doesn't have to be logged, nor should the
* result be stored in the MCT. We want an up-to-date state.
*/
kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
/*
* Mask signals for the mdmd_ksend_message call. This keeps the door
* interface from failing if the user process receives a signal while
* in mdmn_ksend_message.
*/
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
ret = mdmn_ksend_message(MD_MIN2SET(mnum),
MD_MN_MSG_CLU_CHECK,
MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
(char *)&clumsg, sizeof (clumsg), kresult);
sigreplace(&oldmask, (k_sigset_t *)NULL);
if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
/*
* Not open on any node, clear all capabilities, eg ABR and
* DMR
*/
vc.vc_set = 0;
(void) mdmn_send_capability_message(mnum, vc, NULL);
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
}
/*
* mdmn_ksend_show_error:
* ---------------------
* Called to display the error contents of a failing mdmn_ksend_message() result
*
* Input:
* rv - return value from mdmn_ksend_message()
* kres - pointer to result structure filled in by mdmn_ksend_message
* s - Informative message to identify failing condition (e.g.
* "Ownership change") This string will be displayed with
* cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
* administrator
*/
void
mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
{
if (rv == 0) {
cmn_err(CE_WARN, "%s *FAILED*", s);
cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
" = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
kres->kmmr_failing_node);
} else {
cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
}
}
/*
* Callback routine for resync thread. If requested to suspend we mark the
* commd as not being present.
*/
boolean_t
callb_md_mrs_cpr(void *arg, int code)
{
callb_cpr_t *cp = (callb_cpr_t *)arg;
int ret = 0; /* assume success */
clock_t delta;
mutex_enter(cp->cc_lockp);
switch (code) {
case CB_CODE_CPR_CHKPT:
/*
* Mark the rpc.mdcommd as no longer present. We are trying to
* suspend the system and so we should expect RPC failures to
* occur.
*/
md_mn_clear_commd_present();
cp->cc_events |= CALLB_CPR_START;
delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_reltimedwait() returns -1 if it times out. */
if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
break;
break;
case CB_CODE_CPR_RESUME:
cp->cc_events &= ~CALLB_CPR_START;
cv_signal(&cp->cc_stop_cv);
break;
}
mutex_exit(cp->cc_lockp);
return (ret != -1);
}
void
md_rem_hspname(set_t setno, mdkey_t n_key)
{
int s;
int max_sides;
/* All entries removed are in the same diskset */
if (md_get_setstatus(setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (s = 0; s < max_sides; s++)
(void) md_remdevname(setno, s, n_key);
}
int
md_rem_selfname(minor_t selfid)
{
int s;
set_t setno = MD_MIN2SET(selfid);
int max_sides;
md_dev64_t dev;
struct nm_next_hdr *nh;
struct nm_name *n;
mdkey_t key;
/*
* Get the key since the remove routine expects it
*/
dev = md_makedevice(md_major, selfid);
if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
return (ENOENT);
}
if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
MD_KEYWILD, dev, 0L)) == NULL) {
return (ENOENT);
}
/* All entries removed are in the same diskset */
key = n->n_key;
if (md_get_setstatus(setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (s = 0; s < max_sides; s++)
(void) md_remdevname(setno, s, key);
return (0);
}
void
md_upd_set_unnext(set_t setno, unit_t un)
{
if (un < md_set[setno].s_un_next) {
md_set[setno].s_un_next = un;
}
}
struct hot_spare_pool *
find_hot_spare_pool(set_t setno, int hsp_id)
{
hot_spare_pool_t *hsp;
hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
while (hsp != NULL) {
if (hsp->hsp_self_id == hsp_id)
return (hsp);
hsp = hsp->hsp_next;
}
return ((hot_spare_pool_t *)0);
}
/*
* md_create_taskq:
*
* Create a kernel taskq for the given set/unit combination. This is typically
* used to complete a RR_CLEAN request when the callee is unable to obtain the
* mutex / condvar access required to update the DRL safely.
*/
void *
md_create_taskq(set_t setno, minor_t mnum)
{
char name[20];
ddi_taskq_t *tqp;
(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
return ((void *)tqp);
}