lvm/md/md.c

	md.c revision 193974072f41a843678abf5f61979c748687e66b
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Md - is the meta-disk driver.   It sits below the UFS file system
 * but above the 'real' disk drivers, xy, id, sd etc.
 *
 * To the UFS software, md looks like a normal driver, since it has
 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
 * UFS accesses md in the usual ways.  In particular, the strategy
 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
 * and ufs_writelbn().
 *
 * Md maintains an array of minor devices (meta-partitions).   Each
 * meta partition stands for a matrix of real partitions, in rows
 * which are not necessarily of equal length.   Md maintains a table,
 * with one entry for each meta-partition,  which lists the rows and
 * columns of actual partitions, and the job of the strategy routine
 * is to translate from the meta-partition device and block numbers
 * known to UFS into the actual partitions' device and block numbers.
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>
#include <sys/modhash.h>

#ifndef lint
char        _depends_on[] = "strmod/rpcmod";
#endif  /* lint */
int     md_init_debug   = 0;    /* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 */
int     md_ff_disable = 0;

/*
 * dynamically allocated list of non FF driver names - needs to
 * be freed when md is detached.
 */
char    **non_ff_drivers = NULL;

md_krwlock_t    md_unit_array_rw;   /* protects all unit arrays */
md_krwlock_t    nm_lock;        /* protects all the name spaces */

md_resync_t md_cpr_resync;

extern char svm_bootpath[];
#define SVM_PSEUDO_STR  "/pseudo/md@0:"

#define     VERSION_LENGTH  6
#define     VERSION     "1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int     *md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int     md_conv_perc = MDDB_DEVID_CONV_PERC;

#ifdef  DEBUG
/*
 * Debug code to verify framework exclusion guarantees.
 *
 * md_in is a bitmask of the IN_* flags below, one per framework entry
 * point that is currently executing.  MD_SET_IN(x) drops into the debugger
 * if any flag is already set (exclusion lost) or if x itself is already
 * set, then sets x.  MD_CLR_IN(x) drops into the debugger if any flag other
 * than x is set or if x is not set, then clears x.
 *
 * The macro argument is parenthesized so that compound expressions
 * (e.g. IN_INIT|IN_FINI) expand correctly.
 */
int     md_in;
kmutex_t    md_in_mx;           /* protects md_in */
#define IN_INIT     0x01
#define IN_FINI     0x02
#define IN_ATTACH   0x04
#define IN_DETACH   0x08
#define IN_OPEN     0x10
#define MD_SET_IN(x) {                      \
    mutex_enter(&md_in_mx);                 \
    if (md_in)                      \
        debug_enter("MD_SET_IN exclusion lost");    \
    if (md_in & (x))                    \
        debug_enter("MD_SET_IN already set");       \
    md_in |= (x);                       \
    mutex_exit(&md_in_mx);                  \
}

#define MD_CLR_IN(x) {                      \
    mutex_enter(&md_in_mx);                 \
    if (md_in & ~(x))                   \
        debug_enter("MD_CLR_IN exclusion lost");    \
    if (!(md_in & (x)))                 \
        debug_enter("MD_CLR_IN already clr");       \
    md_in &= ~(x);                      \
    mutex_exit(&md_in_mx);                  \
}
#else   /* DEBUG */
/* non-DEBUG kernels: the exclusion checks compile away to nothing */
#define MD_SET_IN(x)
#define MD_CLR_IN(x)
#endif  /* DEBUG */
/* NOTE(review): not referenced in the visible part of this file */
hrtime_t savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t    md_mx;          /* protects md global state */
kcondvar_t  md_cv;          /* md_status events */
int     md_status = 0;      /* global status for the meta-driver */
int     md_num_daemons = 0;
int     md_ioctl_cnt = 0;
int     md_mtioctl_cnt = 0; /* multithreaded ioctl cnt */
uint_t      md_mdelay = 10;     /* variable so can be patched */

int     (*mdv_strategy_tstpnt)(buf_t *, int, void*);

/*
 * md_major is looked up in _init(); md_major_targ is set in mdattach()
 * (only non-zero when MD_UPGRADE) and cleared in mddetach().
 */
major_t     md_major, md_major_targ;

/*
 * md_nunits/md_nsets are zeroed by mddetach() and restored to their
 * maximums by mdattach().  md_nmedh and md_med_trans_lst are loaded from
 * the driver properties of the same name in mdattach().
 */
unit_t      md_nunits = MD_MAXUNITS;
set_t       md_nsets = MD_MAXSETS;
int     md_nmedh = 0;
char        *md_med_trans_lst = NULL;
md_set_t    md_set[MD_MAXSETS];
md_set_io_t md_set_io[MD_MAXSETS];

md_krwlock_t    hsp_rwlp;       /* protects hot_spare_interface */
md_krwlock_t    ni_rwlp;        /* protects notify_interface */
/* md_ops and md_mods are MD_NOPS-entry arrays allocated in mdattach() */
md_ops_t    **md_ops = NULL;
ddi_modhandle_t *md_mods = NULL;
md_ops_t    *md_opslist;
clock_t     md_hz;          /* ticks per second, set in _init() */
md_event_queue_t    *md_event_queue = NULL;

/* reset at the top of mdattach(), then set from driver properties */
int     md_in_upgrade;
int     md_keep_repl_state;
int     md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t   mdmn_door_handle = NULL;
int     mdmn_door_did = -1;

/* devinfo node recorded by mdattach(); NULL while not attached */
dev_info_t      *md_devinfo = NULL;

md_mn_nodeid_t  md_mn_mynode_id = ~0u;  /* My node id (for multi-node sets) */

static  uint_t      md_ocnt[OTYPCNT];
static int      mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int      mdattach(dev_info_t *, ddi_attach_cmd_t);
static int      mddetach(dev_info_t *, ddi_detach_cmd_t);
static int      mdopen(dev_t *, int, int, cred_t *);
static int      mdclose(dev_t, int, int, cred_t *);
static int      mddump(dev_t, caddr_t, daddr_t, int);
static int      mdread(dev_t, struct uio *, cred_t *);
static int      mdwrite(dev_t, struct uio *, cred_t *);
static int      mdaread(dev_t, struct aio_req *, cred_t *);
static int      mdawrite(dev_t, struct aio_req *, cred_t *);
static int      mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int      mdprop_op(dev_t, dev_info_t *,
                ddi_prop_op_t, int, char *, caddr_t, int *);

/*
 * Character/block entry points for the md driver.  Entry points the
 * driver does not implement (print, devmap, mmap, segmap) are filled in
 * with nulldev/nodev cast to the expected prototype.
 */
static struct cb_ops md_cb_ops = {
    mdopen,         /* open */
    mdclose,        /* close */
    mdstrategy,     /* strategy */
                /* print routine -- none yet */
    (int(*)(dev_t, char *))nulldev,
    mddump,         /* dump */
    mdread,         /* read */
    mdwrite,        /* write */
    mdioctl,        /* ioctl */
                /* devmap */
    (int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
            uint_t))nodev,
                /* mmap */
    (int(*)(dev_t, off_t, int))nodev,
                /* segmap */
    (int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
        unsigned, unsigned, cred_t *))nodev,
    nochpoll,       /* poll */
    mdprop_op,      /* prop_op */
    0,          /* streamtab */
    (D_64BIT|D_MP|D_NEW),   /* driver compatibility flag */
    CB_REV,         /* cb_ops version */
    mdaread,        /* aread */
    mdawrite,       /* awrite */
};

/*
 * Device operations for the md driver: ties the autoconfiguration entry
 * points (mdinfo, mdattach, mddetach) to the cb_ops table above.
 */
static struct dev_ops md_devops = {
    DEVO_REV,       /* dev_ops version */
    0,          /* device reference count */
    mdinfo,         /* info routine */
    nulldev,        /* identify routine */
    nulldev,        /* probe - not defined */
    mdattach,       /* attach routine */
    mddetach,       /* detach routine */
    nodev,          /* reset - not defined */
    &md_cb_ops,     /* driver operations */
    NULL,           /* bus operations */
    nodev,          /* power management */
    ddi_quiesce_not_needed,     /* quiesce */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

/* driver linkage record: describes md to the module framework */
static struct modldrv modldrv = {
    &mod_driverops,         /* type of module -- a pseudodriver */
    "Solaris Volume Manager base module", /* name of the module */
    &md_devops,         /* driver ops */
};

/* module linkage passed to mod_install/mod_remove/mod_info below */
static struct modlinkage modlinkage = {
    MODREV_1,
    (void *)&modldrv,
    NULL
};


/* md_medd.c */
extern  void    med_init(void);
extern  void    med_fini(void);
extern  void    md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void         *lookup_entry(struct nm_next_hdr *, set_t,
                    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr   *get_first_record(set_t, int, int);
extern int          remove_entry(struct nm_next_hdr *,
                    side_t, mdkey_t, int);

int     md_maxphys  = 0;    /* maximum io size in bytes */
#define     MD_MAXBCOUNT    (1024 * 1024)
unsigned    md_maxbcount    = 0;    /* maximum physio size in bytes */

/*
 * Some md ioctls trigger io framework device tree operations.  An
 * example is md ioctls that call md_resolve_bydevid(): which uses the
 * io framework to resolve a devid. Such operations result in acquiring
 * io framework locks (like ndi_devi_enter() of "/") while holding
 * driver locks (like md_unit_writerlock()).
 *
 * The prop_op(9E) entry point is called from the devinfo driver with
 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
 * implementation must avoid taking a lock that is held per above md
 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
 * without risking deadlock.
 *
 * To service "size" requests without risking deadlock, we maintain a
 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
 */
static kmutex_t     md_nblocks_mutex;
static mod_hash_t   *md_nblocksmap;     /* mnum -> nblocks */
int         md_nblocksmap_size = 512;

/*
 * md_nblocks_set:
 *  Keep the "mnum->nblocks" sizemap used by mdprop_op() in sync with
 *  a unit's size.
 *
 * Callers:
 *  Create  - code that establishes a unit's un_total_blocks calls
 *        md_nblocks_set(mnum, un->c.un_total_blocks);
 *        (locate via cscope: md_create_minor_node,
 *        md_create_unit_incore, or "MD_UNIT..*=")
 *  Change  - code that changes a unit's un_total_blocks calls
 *        md_nblocks_set(mnum, un->c.un_total_blocks);
 *        (locate via cscope: "un_total_blocks[ \t]*=")
 *  Destroy - code that deletes a unit calls
 *        md_nblocks_set(mnum, -1ULL);
 *        (locate via cscope: md_remove_minor_node,
 *        md_destroy_unit_incore, or "MD_UNIT..*=")
 *
 * Input:
 *  mnum    - minor number used as the hash key
 *  nblocks - size to record, or -1ULL to drop the entry for mnum
 *
 * The update is made while holding md_nblocks_mutex.
 */
void
md_nblocks_set(minor_t mnum, uint64_t nblocks)
{
    mod_hash_key_t  hkey = (mod_hash_key_t)(intptr_t)mnum;

    mutex_enter(&md_nblocks_mutex);
    if (nblocks != -1ULL) {
        /* create or overwrite the entry for this minor */
        (void) mod_hash_replace(md_nblocksmap, hkey,
            (mod_hash_val_t)(intptr_t)nblocks);
    } else {
        /* -1ULL is the "unit deleted" sentinel */
        (void) mod_hash_destroy(md_nblocksmap, hkey);
    }
    mutex_exit(&md_nblocks_mutex);
}

/*
 * md_nblocks_get:
 *  Look up the size recorded for mnum in the "mnum->nblocks" sizemap.
 *
 * Returns the recorded value, or 0 when the sizemap holds no entry
 * for mnum.  The lookup is made while holding md_nblocks_mutex.
 */
uint64_t
md_nblocks_get(minor_t mnum)
{
    mod_hash_val_t  val;
    uint64_t    nblocks = 0;

    mutex_enter(&md_nblocks_mutex);
    if (mod_hash_find(md_nblocksmap,
        (mod_hash_key_t)(intptr_t)mnum, &val) == 0)
        nblocks = (uint64_t)(intptr_t)val;
    mutex_exit(&md_nblocks_mutex);

    return (nblocks);
}

/*
 * md_global_alloc_free:
 *  Initialize (alloc != 0) or destroy (alloc == 0) the locks and
 *  condition variables that belong to the driver globals, both the
 *  driver-wide ones and the per-set ones in md_set[] / md_set_io[].
 *  Teardown runs in the reverse order of setup.
 */
void
md_global_alloc_free(int alloc)
{
    set_t   setno;

    if (!alloc) {
        /* per set primitives first ... */
        for (setno = 0; setno < MD_MAXSETS; setno++) {
            cv_destroy(&md_set_io[setno].md_io_cv);
            mutex_destroy(&md_set_io[setno].md_io_mx);
            mutex_destroy(&md_set[setno].s_dbmx);
        }

        /* ... then the driver wide ones */
        mutex_destroy(&md_nblocks_mutex);
        mutex_destroy(&md_cpr_resync.md_resync_mutex);
        rw_destroy(&hsp_rwlp.lock);
        rw_destroy(&ni_rwlp.lock);
        rw_destroy(&nm_lock.lock);
        rw_destroy(&md_unit_array_rw.lock);
        mutex_destroy(&md_mx);
        cv_destroy(&md_cv);
        return;
    }

    /* driver wide locks */
    cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
    rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
    rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
    rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
    rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
    mutex_init(&md_cpr_resync.md_resync_mutex, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);

    /* per set locks */
    for (setno = 0; setno < MD_MAXSETS; setno++) {
        mutex_init(&md_set[setno].s_dbmx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&md_set_io[setno].md_io_mx, NULL, MUTEX_DEFAULT,
            NULL);
        cv_init(&md_set_io[setno].md_io_cv, NULL, CV_DEFAULT, NULL);
    }
}

/*
 * _init:
 *  Module load entry point.  Sets up the driver global locks and
 *  tunables, marks every set's io state active, then installs the
 *  module.  If mod_install() fails the global locks are torn down
 *  again.  Returns the mod_install() result.
 */
int
_init(void)
{
    set_t   setno;
    int rval;

    MD_SET_IN(IN_INIT);

    /* locks and condition variables for the driver globals */
    md_global_alloc_free(1);

    md_major = ddi_name_to_major("md");
    md_hz = drv_usectohz(NUM_USEC_IN_SEC);

    /* tunables keep any value patched in before load */
    if (md_maxphys == 0)
        md_maxphys = maxphys;       /* maximum io size in bytes */
    if (md_maxbcount == 0)
        md_maxbcount = MD_MAXBCOUNT;    /* maximum physio size */

    for (setno = 0; setno < MD_MAXSETS; setno++)
        md_set_io[setno].io_state = MD_SET_ACTIVE;

    /*
     * NOTE: the framework does not currently guarantee exclusion
     * between _init and attach after calling mod_install, so the
     * IN_INIT marker is dropped before the call.
     */
    MD_CLR_IN(IN_INIT);
    rval = mod_install(&modlinkage);
    if (rval == 0)
        return (rval);

    /* install failed: undo the global setup */
    MD_SET_IN(IN_INIT);
    md_global_alloc_free(0);
    MD_CLR_IN(IN_INIT);
    return (rval);
}

/*
 * _fini:
 *  Module unload entry point.  Removes the module and, only when
 *  that succeeds, tears down the driver global locks.  Returns the
 *  mod_remove() result.
 */
int
_fini(void)
{
    int rval;

    /*
     * NOTE: the framework currently does not guarantee exclusion
     * with attach until after mod_remove returns 0.
     */
    rval = mod_remove(&modlinkage);
    if (rval != 0)
        return (rval);

    MD_SET_IN(IN_FINI);
    md_global_alloc_free(0);    /* free dynamic space */
    MD_CLR_IN(IN_FINI);

    return (rval);
}

/*
 * _info:
 *  Module information entry point; hands the request to mod_info()
 *  with this module's linkage and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
    int rval;

    rval = mod_info(&modlinkage, modinfop);
    return (rval);
}

/*
 * mdattach:
 *  attach(9E) entry point for the md pseudo device.
 *
 * Only DDI_ATTACH is supported, and only while no devinfo node is
 * currently recorded in md_devinfo.  The routine initializes the mddb
 * layer, starts the daemons, loads the tunables and upgrade translation
 * tables from the driver properties, creates the "admin" minor node and,
 * when booting from a metadevice (svm_bootpath names one), the minor
 * nodes of the root metadevice.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  Failures after mddb_init() and
 * daemon start are unwound by calling mddetach(), so everything that
 * mddetach() tears down unconditionally must be set up before the first
 * 'goto attach_failure'.
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    int len;
    unit_t  i;
    size_t  sz;
    /* zero filled so strcmp() below always sees a terminated string */
    char    ver[VERSION_LENGTH] = { 0 };
    char    **maj_str_array;
    char    *str, *str2;

    MD_SET_IN(IN_ATTACH);
    md_in_upgrade = 0;
    md_keep_repl_state = 0;
    md_devid_destroy = 0;

    if (cmd != DDI_ATTACH) {
        MD_CLR_IN(IN_ATTACH);
        return (DDI_FAILURE);
    }

    /* only a single attached instance is supported */
    if (md_devinfo != NULL) {
        MD_CLR_IN(IN_ATTACH);
        return (DDI_FAILURE);
    }

    mddb_init();

    if (md_start_daemons(TRUE)) {
        MD_CLR_IN(IN_ATTACH);
        mddb_unload();      /* undo mddb_init() allocations */
        return (DDI_FAILURE);
    }

    /* clear the halted state */
    md_clr_status(MD_GBL_HALTED);

    /* see if the diagnostic switch is on */
    if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
        DDI_PROP_DONTPASS, "md_init_debug", 0))
        md_init_debug++;

    /* see if the failfast disable switch is on */
    if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
        DDI_PROP_DONTPASS, "md_ff_disable", 0))
        md_ff_disable++;

    /* try and get the md_nmedh property */
    md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
        DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
    if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
        md_nmedh = MED_DEF_HOSTS;

    /*
     * Try and get the md_med_trans_lst property, defaulting to "tcp".
     *
     * mddetach() frees md_med_trans_lst with strlen() + 1, so the
     * global must always be an exactly sized string.  The property is
     * therefore read into a scratch buffer (one byte larger than the
     * property, zero filled, so it is always terminated) and then
     * duplicated with md_strdup().
     */
    len = 0;
    if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
        0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
        len <= 0) {
        md_med_trans_lst = md_strdup("tcp");
    } else {
        size_t  bufsz = (size_t)len + 1;
        char    *buf = kmem_zalloc(bufsz, KM_SLEEP);

        if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
            0, "md_med_trans_lst", buf, &len) == DDI_PROP_SUCCESS)
            md_med_trans_lst = md_strdup(buf);
        else
            md_med_trans_lst = md_strdup("tcp");
        kmem_free(buf, bufsz);
    }

    /*
     * Must initialize the internal data structures before any
     * possible 'goto attach_failure', as the mddetach routine
     * references them.
     */
    med_init();

    md_ops = (md_ops_t **)kmem_zalloc(
        sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
    md_mods = (ddi_modhandle_t *)kmem_zalloc(
        sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

    /*
     * Create the hash to store the meta device sizes.  This is done
     * here rather than at the end of attach because mddetach(), which
     * every attach_failure path calls, destroys the hash.
     */
    md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
        md_nblocksmap_size, mod_hash_null_valdtor);

    /* try and get the md_xlate property */
    /* Should we only do this if upgrade? */
    len = sizeof (char) * 5;
    if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
        0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
        if (strcmp(ver, VERSION) == 0) {
            len = 0;
            if (ddi_prop_op(DDI_DEV_T_ANY, dip,
                PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
                (caddr_t)&md_tuple_table, &len) !=
                DDI_PROP_SUCCESS) {
                if (md_init_debug)
                    cmn_err(CE_WARN,
                        "md_xlate ddi_prop_op failed");
                goto attach_failure;
            } else {
                md_tuple_length =
                    len/(2 * ((int)sizeof (dev32_t)));
                md_in_upgrade = 1;
            }

            /* Get target's name to major table */
            if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
                dip, DDI_PROP_DONTPASS,
                "md_targ_nm_table", &maj_str_array,
                &md_majortab_len) != DDI_PROP_SUCCESS) {
                md_majortab_len = 0;
                if (md_init_debug)
                    cmn_err(CE_WARN, "md_targ_nm_table "
                        "ddi_prop_lookup_string_array "
                        "failed");
                goto attach_failure;
            }

            md_major_tuple_table =
                (struct md_xlate_major_table *)
                kmem_zalloc(md_majortab_len *
                sizeof (struct md_xlate_major_table), KM_SLEEP);

            /* each entry is parsed as "<driver name> <major number>" */
            for (i = 0; i < md_majortab_len; i++) {
                /* Getting major name */
                str = strchr(maj_str_array[i], ' ');
                if (str == NULL)
                    continue;
                /* temporarily terminate the name at the blank */
                *str = '\0';
                md_major_tuple_table[i].drv_name =
                    md_strdup(maj_str_array[i]);

                /* Simplified atoi to get major number */
                str2 = str + 1;
                md_major_tuple_table[i].targ_maj = 0;
                while ((*str2 >= '0') && (*str2 <= '9')) {
                    md_major_tuple_table[i].targ_maj *= 10;
                    md_major_tuple_table[i].targ_maj +=
                        *str2++ - '0';
                }
                /* restore the blank overwritten above */
                *str = ' ';
            }
            ddi_prop_free((void *)maj_str_array);
        } else {
            if (md_init_debug)
                cmn_err(CE_WARN, "md_xlate_ver is incorrect");
            goto attach_failure;
        }
    }

    /*
     * Check for properties:
     *  md_keep_repl_state and md_devid_destroy
     * and set globals if these exist.
     */
    md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
        0, "md_keep_repl_state", 0);

    md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
        0, "md_devid_destroy", 0);

    if (MD_UPGRADE)
        md_major_targ = md_targ_name_to_major("md");
    else
        md_major_targ = 0;

    /* allocate admin device node */
    if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
        MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
        goto attach_failure;

    if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
        DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
        goto attach_failure;

    if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
        "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
        goto attach_failure;

    /* these could have been cleared by a detach */
    md_nunits = MD_MAXUNITS;
    md_nsets = MD_MAXSETS;

    sz = sizeof (void *) * MD_MAXUNITS;
    if (md_set[0].s_un == NULL)
        md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
    if (md_set[0].s_ui == NULL)
        md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

    md_devinfo = dip;

    /*
     * Only allocate device node for root mirror metadevice.
     * Don't pre-allocate unnecessary device nodes (thus slowing down a
     * boot when we attach).
     * We can't read the mddbs in attach.  The mddbs will be read
     * by metainit during the boot process when it is doing the
     * auto-take processing and any other minor nodes will be
     * allocated at that point.
     *
     * There are two scenarios to be aware of here:
     * 1) when we are booting from a mirrored root we need the root
     *    metadevice to exist very early (during vfs_mountroot processing)
     * 2) we need all of the nodes to be created so that any mnttab entries
     *    will succeed (handled by metainit reading the mddb during boot).
     */
    if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
        == 0) {
        char *p;
        int mnum = 0;

        /*
         * The svm_bootpath string looks something like
         * /pseudo/md@0:0,150,blk where 150 is the minor number
         * in this example so we need to set the pointer p onto
         * the first digit of the minor number and convert it
         * from ascii.  (sizeof includes the terminating NUL, so
         * the offset below skips the prefix, the one digit set
         * number and the comma.)
         */
        for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
            *p >= '0' && *p <= '9'; p++) {
            mnum *= 10;
            mnum += *p - '0';
        }

        if (md_create_minor_node(0, mnum)) {
            /*
             * Clear the pointers after freeing: mddetach()
             * (called from attach_failure) frees every
             * non-NULL unit array and would otherwise free
             * these a second time.
             */
            kmem_free(md_set[0].s_un, sz);
            md_set[0].s_un = NULL;
            kmem_free(md_set[0].s_ui, sz);
            md_set[0].s_ui = NULL;
            goto attach_failure;
        }
    }

    MD_CLR_IN(IN_ATTACH);
    return (DDI_SUCCESS);

attach_failure:
    /*
     * Use our own detach routine to toss any stuff we allocated above.
     * NOTE: detach will call md_halt to free the mddb_init allocations.
     */
    MD_CLR_IN(IN_ATTACH);
    if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
        cmn_err(CE_WARN, "detach from attach failed");
    return (DDI_FAILURE);
}

/*
 * mddetach:
 *  detach(9E) entry point for the md pseudo device; also used by
 *  mdattach() to unwind a partially completed attach.
 *
 * Only DDI_DETACH is supported.  If the driver is not yet halted and no
 * locators are active, a halt is initiated; the detach fails unless the
 * driver ends up in the halted state.  Everything mdattach() allocated
 * is then released and the devinfo node's properties and minor nodes are
 * removed.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    extern int  check_active_locators();
    set_t       s;
    size_t      sz;
    int     len;

    MD_SET_IN(IN_DETACH);

    /* check command */
    if (cmd != DDI_DETACH) {
        MD_CLR_IN(IN_DETACH);
        return (DDI_FAILURE);
    }

    /*
     * if we have not already halted yet we have no active config
     * then automatically initiate a halt so we can detach.
     */
    if (!(md_get_status() & MD_GBL_HALTED)) {
        if (check_active_locators() == 0) {
            /*
             * NOTE: a successful md_halt will have done the
             * mddb_unload to free allocations done in mddb_init
             */
            if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
                cmn_err(CE_NOTE, "md:detach: "
                    "Could not halt Solaris Volume Manager");
                MD_CLR_IN(IN_DETACH);
                return (DDI_FAILURE);
            }
        }

        /* fail detach if we have not halted */
        if (!(md_get_status() & MD_GBL_HALTED)) {
            MD_CLR_IN(IN_DETACH);
            return (DDI_FAILURE);
        }
    }

    /* must be in halted state, this will be cleared on next attach */
    ASSERT(md_get_status() & MD_GBL_HALTED);

    /* cleanup attach allocations and initializations */
    md_major_targ = 0;

    /* release the per set unit arrays; NULL marks them as gone */
    sz = sizeof (void *) * md_nunits;
    for (s = 0; s < md_nsets; s++) {
        if (md_set[s].s_un != NULL) {
            kmem_free(md_set[s].s_un, sz);
            md_set[s].s_un = NULL;
        }

        if (md_set[s].s_ui != NULL) {
            kmem_free(md_set[s].s_ui, sz);
            md_set[s].s_ui = NULL;
        }
    }
    md_nunits = 0;
    md_nsets = 0;
    md_nmedh = 0;

    if (non_ff_drivers != NULL) {
        int i;

        for (i = 0; non_ff_drivers[i] != NULL; i++)
            kmem_free(non_ff_drivers[i],
                strlen(non_ff_drivers[i]) + 1);

        /* free i+1 entries because there is a null entry at list end */
        kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
        non_ff_drivers = NULL;
    }

    if (md_med_trans_lst != NULL) {
        kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
        md_med_trans_lst = NULL;
    }

    if (md_mods != NULL) {
        kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
        md_mods = NULL;
    }

    if (md_ops != NULL) {
        kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
        md_ops = NULL;
    }

    /* release the upgrade translation tables loaded by mdattach */
    if (MD_UPGRADE) {
        len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
        md_in_upgrade = 0;
        md_xlate_free(len);
        md_majortab_free();
    }

    /*
     * Undo what we did in mdattach, freeing resources
     * and removing things we installed.  The system
     * framework guarantees we are not active with this devinfo
     * node in any other entry points at this time.
     */
    ddi_prop_remove_all(dip);
    ddi_remove_minor_node(dip, NULL);

    med_fini();

    /*
     * The size hash may not exist (detach called to unwind an attach
     * that failed before creating it) or may already have been
     * destroyed by an earlier detach; only destroy a live hash and
     * clear the handle so it cannot be destroyed twice.
     */
    if (md_nblocksmap != NULL) {
        mod_hash_destroy_idhash(md_nblocksmap);
        md_nblocksmap = NULL;
    }

    md_devinfo = NULL;

    MD_CLR_IN(IN_DETACH);
    return (DDI_SUCCESS);
}


/*
 * mdinfo:
 *  getinfo(9E) entry point.
 *
 *  DDI_INFO_DEVT2DEVINFO  - hand back the devinfo pointer recorded by
 *               mdattach (fails while md_devinfo is NULL)
 *  DDI_INFO_DEVT2INSTANCE - always instance 0
 *
 * Any other request returns DDI_FAILURE and leaves *result untouched.
 */
/*ARGSUSED*/
static int
mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
    if (infocmd == DDI_INFO_DEVT2INSTANCE) {
        *result = (void *)0;
        return (DDI_SUCCESS);
    }

    if (infocmd == DDI_INFO_DEVT2DEVINFO && md_devinfo) {
        *result = (void *)md_devinfo;
        return (DDI_SUCCESS);
    }

    return (DDI_FAILURE);
}

/*
 * mdprop_op:
 *  prop_op(9E) entry point.  Looks up the size recorded for the
 *  minor in the "mnum->nblocks" sizemap and passes it, together with
 *  the unmodified request, to ddi_prop_op_nblocks().
 */
static int
mdprop_op(
    dev_t dev,      /* device number associated with device */
    dev_info_t *dip,    /* device info struct for this device */
    ddi_prop_op_t prop_op,  /* property operator */
    int mod_flags,      /* property flags */
    char *name,     /* name of property */
    caddr_t valuep,     /* where to put property value */
    int *lengthp)       /* put length of property here */
{
    uint64_t    nblocks;

    nblocks = md_nblocks_get(getminor(dev));

    return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
        name, valuep, lengthp, nblocks));
}

/*
 * snarf_user_data:
 *  Walk every MDDB_USER record of the given set and tag it:
 *  records already marked MD_PRV_GOTIT and stale records are left
 *  alone, records without data are marked MD_PRV_PENDDEL, and all
 *  remaining records are marked MD_PRV_GOTIT.
 */
static void
snarf_user_data(set_t setno)
{
    mddb_recid_t        id;
    mddb_recstatus_t    rstat;

    for (id = mddb_getnextrec(mddb_makerecid(setno, 0), MDDB_USER, 0);
        id > 0; id = mddb_getnextrec(id, MDDB_USER, 0)) {

        /* already picked up */
        if (mddb_getrecprivate(id) & MD_PRV_GOTIT)
            continue;

        rstat = mddb_getrecstatus(id);
        if (rstat == MDDB_STALE)
            continue;

        if (rstat != MDDB_NODATA) {
            ASSERT(rstat == MDDB_OK);
            mddb_setrecprivate(id, MD_PRV_GOTIT);
        } else {
            mddb_setrecprivate(id, MD_PRV_PENDDEL);
        }
    }
}

/*
 * md_print_block_usage:
 *  Warn that the state database replicas of set 's' are too small
 *  to take 'blks' more blocks, then list every non-deleted,
 *  non-empty replica whose block count is below the required total
 *  together with the number of blocks it is short by.
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
    mddb_lb_t       *lbp;
    mddb_sidelocator_t  *slp;
    mddb_mb_ic_t        *mbip;
    md_splitname        sn;
    uint_t          needed;     /* blocks a replica must hold */
    uint_t          have;       /* blocks a replica holds now */
    size_t          prelen;
    size_t          suflen;
    char            *path;
    int         pathsz;
    int         drvidx;
    int         li;

    needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

    cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
        "            Additional Blocks Needed:            %d\n\n"
        "            Increase size of following replicas for\n"
        "            device relocatability by deleting listed\n"
        "            replica and re-adding replica with\n"
        "            increased size (see metadb(1M)):\n"
        "                Replica                   Increase By",
        s->s_totalblkcnt, (blks - s->s_freeblkcnt));

    lbp = s->s_lbp;

    for (li = 0; li < lbp->lb_loccnt; li++) {
        if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
            continue;

        /* add up the blocks of all master blocks of this replica */
        have = 0;
        mbip = s->s_mbiarray[li];
        while (mbip != NULL) {
            have += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
            mbip = mbip->mbi_next;
        }

        /* nothing to report for empty or big enough replicas */
        if (have == 0 || have >= needed)
            continue;

        slp = &lbp->lb_sidelocators[s->s_sideno][li];
        drvidx = slp->l_drvnm_index;
        mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &sn);
        prelen = SPN_PREFIX(&sn).pre_len;
        suflen = SPN_SUFFIX(&sn).suf_len;

        /* "<prefix>/<suffix>" plus the terminating NUL */
        pathsz = (int)(prelen + suflen + 2);
        path = (char *)kmem_alloc(pathsz, KM_SLEEP);
        (void) strncpy(path, SPN_PREFIX(&sn).pre_data, prelen);
        path[prelen] = '/';
        (void) strncpy(path + (prelen + 1), SPN_SUFFIX(&sn).suf_data,
            suflen);
        path[prelen + suflen + 1] = '\0';

        cmn_err(CE_WARN,
            "  %s (%s:%d:%d)   %d blocks",
            path, lbp->lb_drvnm[drvidx].dn_data,
            slp->l_mnum, lbp->lb_locators[li].l_blkno,
            (needed - have));
        kmem_free(path, pathsz);
    }
}

/*
 * md_create_minor_node:
 *  Create the block ("<set>,<unit>,blk") and character
 *  ("<set>,<unit>,raw") minor nodes for a unit on md_devinfo.
 *
 * Input:
 *  setno   - set number, must be below MD_MAXSETS
 *  mnum    - selfID of unit; its unit part must be below MD_MAXUNITS
 *
 * Returns 0 for success, 1 for failure (bad argument or either node
 * could not be created).
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
    char    nodename[20];

    /* reject out of range set or unit numbers */
    if (setno >= MD_MAXSETS)
        return (1);
    if (MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
        return (1);

    /* block node */
    (void) snprintf(nodename, sizeof (nodename), "%u,%u,blk",
        (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
    if (ddi_create_minor_node(md_devinfo, nodename, S_IFBLK,
        MD_MKMIN(setno, mnum), DDI_PSEUDO, 0) != 0)
        return (1);

    /* character (raw) node */
    (void) snprintf(nodename, sizeof (nodename), "%u,%u,raw",
        (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
    if (ddi_create_minor_node(md_devinfo, nodename, S_IFCHR,
        MD_MKMIN(setno, mnum), DDI_PSEUDO, 0) != 0)
        return (1);

    return (0);
}

/*
 * md_verify_orphaned_record:
 *  Decide whether the name space record for 'key' is an orphan.
 *
 * A record is reported as orphaned (1) only when all of these hold:
 *  - the set has a non-shared devid name space,
 *  - the set's locator block is flagged MDDB_DEVID_STYLE,
 *  - the key resolves (using MD_NOTRUST_DEVT) to a device that is
 *    not a metadevice, and
 *  - that device has no entry in the devid name space.
 *
 * In every other case 0 is returned and the record is kept; a key
 * without a devid entry may simply belong to a device that does not
 * support device ids.
 */

static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
    mddb_set_t      *setp;
    struct nm_next_hdr  *devid_nh;
    md_dev64_t      dev;
    side_t          side = 0;

    setp = (mddb_set_t *)md_set[setno].s_db;

    devid_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED));
    if (devid_nh == NULL)
        return (0);

    /* only devid style replicas can have orphans */
    if (!(setp->s_lbp->lb_flags & MDDB_DEVID_STYLE))
        return (0);

    /* devid style: get the dev_t using MD_NOTRUST_DEVT */
    dev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
    if (dev == NODEV64)
        return (0);
    if (md_getmajor(dev) == md_major)
        return (0);

    if (lookup_entry(devid_nh, setno, side, key, dev, NM_DEVID) != NULL)
        return (0);

    return (1);
}

/*
 * md_snarf_db_set:
 *  Bring the in-core state of a set up from its state database
 *  ("snarf" the set).  The work is single threaded per set through
 *  md_haltsnarf_enter()/md_haltsnarf_exit().
 *
 *  Steps, in order:
 *  - return immediately if the set is already MD_SET_SNARFED
 *  - start the md daemons if they are not live
 *  - load the devid and the regular namespaces
 *  - optionally convert the replica to device id format
 *  - load every sub-module referenced by a database record
 *  - snarf user data, then call each module's md_snarf(MD_SNARF_DOIT)
 *    until a full pass adds nothing
 *  - compute s_un_next / s_un_avail and mark the set snarfed
 *  - commit records flagged MD_PRV_COMMIT, then delete records flagged
 *    MD_PRV_DELETE
 *  - call each module's md_snarf(MD_SNARF_CLEANUP) until nothing is
 *    cleaned, then remove orphaned namespace entries and clean up the
 *    device id namespace
 *
 * Input:
 *  setno   - set number
 *  ep      - error structure filled in on failure; may be NULL
 *            (mdopen() passes NULL)
 *
 * Returns 0 on success, or if the set was already snarfed; -1 on error.
 *
 * NOTE(review): when the module-loading pass ends with a negative
 * record id the function leaves through 'out' with err still 0 and
 * without marking the set snarfed - confirm that this is intended.
 */
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
    int         err = 0;
    int         i;
    mddb_recid_t        recid;
    mddb_type_t     drvrid;
    mddb_recstatus_t    status;
    md_ops_t        *ops;
    uint_t          privat;
    mddb_set_t      *s;
    uint_t          cvt_blks;
    struct nm_next_hdr  *nh = NULL;
    mdkey_t         key = MD_KEYWILD;
    side_t          side = 0;
    int         size = 0;
    int         devid_flag;
    int         retval;
    uint_t          un;
    int         un_next_set = 0;

    md_haltsnarf_enter(setno);

    /* Nothing to do if somebody already snarfed this set */
    mutex_enter(&md_mx);
    if (md_set[setno].s_status & MD_SET_SNARFED) {
        mutex_exit(&md_mx);
        md_haltsnarf_exit(setno);
        return (0);
    }
    mutex_exit(&md_mx);

    if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
        if (md_start_daemons(TRUE)) {
            if (ep != NULL)
                (void) mdsyserror(ep, ENXIO);
            err = -1;
            goto out;
        }
    }


    /*
     * Load the devid name space if it exists
     */
    (void) md_load_namespace(setno, NULL, NM_DEVID);
    if (!md_load_namespace(setno, ep, 0L)) {
        /*
         * Unload the devid namespace
         */
        (void) md_unload_namespace(setno, NM_DEVID);
        err = -1;
        goto out;
    }

    /*
     * If replica is in non-devid state, convert if:
     *  - not in probe during upgrade (md_keep_repl_state = 0)
     *  - enough space available in replica
     *  - local set
     *  - not a multi-node diskset
     *  - clustering is not present (for non-local set)
     */
    s = (mddb_set_t *)md_set[setno].s_db;
    devid_flag = 0;
    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
        devid_flag = 1;
    if (cluster_bootflags & CLUSTER_CONFIGURED)
        if (setno != MD_LOCAL_SET)
            devid_flag = 0;
    if (MD_MNSET_SETNO(setno))
        devid_flag = 0;
    if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
        devid_flag = 0;

    /*
     * if we weren't devid style before and md_keep_repl_state=1
     * we need to stay non-devid
     */
    if ((md_keep_repl_state == 1) &&
        ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
        devid_flag = 0;
    if (devid_flag) {
        /*
         * Determine number of free blocks needed to convert
         * entire replica to device id format - locator blocks
         * and namespace.
         */
        cvt_blks = 0;
        if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
            if (ep != NULL)
                (void) mdsyserror(ep, EIO);
            err = -1;
            goto out;

        }
        cvt_blks += md_nm_did_chkspace(setno);

        /* add MDDB_DEVID_CONV_PERC% */
        if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
            cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
        }

        if (cvt_blks <= s->s_freeblkcnt) {
            if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
                if (ep != NULL)
                    (void) mdsyserror(ep, EIO);
                err = -1;
                goto out;
            }

        } else {
            /*
             * Print message that replica can't be converted for
             * lack of space.   No failure - just continue to
             * run without device ids.
             */
            cmn_err(CE_WARN,
                "Unable to add Solaris Volume Manager device "
                "relocation data.\n"
                "          To use device relocation feature:\n"
                "          - Increase size of listed replicas\n"
                "          - Reboot");
            md_print_block_usage(s, cvt_blks);
            cmn_err(CE_WARN,
                "Loading set without device relocation data.\n"
                "          Solaris Volume Manager disk movement "
                "not tracked in local set.");
        }
    }

    /*
     * go through and load any modules referenced in
     * data base
     */
    recid = mddb_makerecid(setno, 0);
    while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
        status = mddb_getrecstatus(recid);
        if (status == MDDB_STALE) {
            if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
                md_set_setstatus(setno, MD_SET_STALE);
                cmn_err(CE_WARN,
                    "md: state database is stale");
            }
        } else if (status == MDDB_NODATA) {
            mddb_setrecprivate(recid, MD_PRV_PENDDEL);
            continue;
        }
        drvrid = mddb_getrectype1(recid);
        if (drvrid < MDDB_FIRST_MODID)
            continue;
        if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
            drvrid) < 0) {
            cmn_err(CE_NOTE, "md: could not load misc/%s",
                md_getshared_name(setno, drvrid));
        }
    }

    if (recid < 0)
        goto out;

    snarf_user_data(setno);

    /*
     * Initialize the md_nm_snarfed array
     * this array is indexed by the key and
     * is set by md_getdevnum during the snarf time
     */
    if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
        size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
            r_next_key) * (sizeof (int)));
        md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
    }

    /*
     * go through and snarf until nothing gets added
     */
    do {
        i = 0;
        for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
            if (ops->md_snarf != NULL) {
                retval = ops->md_snarf(MD_SNARF_DOIT, setno);
                if (retval == -1) {
                    err = -1;
                    /*
                     * Don't know the failed unit.
                     * ep may be NULL (see mdopen()), so
                     * guard it like every other error
                     * report in this function.
                     */
                    if (ep != NULL)
                        (void) mdmderror(ep,
                            MDE_RR_ALLOC_ERROR, 0);
                    (void) md_halt_set(setno, MD_HALT_ALL);
                    (void) mddb_unload_set(setno);
                    /*
                     * Release the snarf tracking array so
                     * it is neither leaked nor left as a
                     * stale global for the next snarf.
                     */
                    if (md_nm_snarfed != NULL) {
                        kmem_free(md_nm_snarfed, size);
                        md_nm_snarfed = NULL;
                    }
                    md_haltsnarf_exit(setno);
                    return (err);
                } else {
                    i += retval;
                }
            }
        }
    } while (i);

    /*
     * Set the first available slot and availability
     */
    md_set[setno].s_un_avail = 0;
    for (un = 0; un < MD_MAXUNITS; un++) {
        if (md_set[setno].s_un[un] != NULL) {
            continue;
        } else {
            if (!un_next_set) {
                md_set[setno].s_un_next = un;
                un_next_set = 1;
            }
            md_set[setno].s_un_avail++;
        }
    }

    md_set_setstatus(setno, MD_SET_SNARFED);

    /* Commit every record a module flagged during the snarf */
    recid = mddb_makerecid(setno, 0);
    while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
        privat = mddb_getrecprivate(recid);
        if (privat & MD_PRV_COMMIT) {
            if (mddb_commitrec(recid)) {
                if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
                    md_set_setstatus(setno, MD_SET_STALE);
                    cmn_err(CE_WARN,
                        "md: state database is stale");
                }
            }
            mddb_setrecprivate(recid, MD_PRV_GOTIT);
        }
    }

    /* Deletes must happen after all the commits */
    recid = mddb_makerecid(setno, 0);
    while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
        privat = mddb_getrecprivate(recid);
        if (privat & MD_PRV_DELETE) {
            if (mddb_deleterec(recid)) {
                if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
                    md_set_setstatus(setno, MD_SET_STALE);
                    cmn_err(CE_WARN,
                        "md: state database is stale");
                }
                mddb_setrecprivate(recid, MD_PRV_GOTIT);
            }
            /* restart the scan from the first record */
            recid = mddb_makerecid(setno, 0);
        }
    }

    /*
     * go through and clean up records until nothing gets cleaned up.
     */
    do {
        i = 0;
        for (ops = md_opslist; ops != NULL; ops = ops->md_next)
            if (ops->md_snarf != NULL)
                i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
    } while (i);

    if (md_nm_snarfed != NULL &&
        !(md_get_setstatus(setno) & MD_SET_STALE)) {
        /*
         * go thru and cleanup the namespace and the device id
         * name space
         */
        for (key = 1;
            key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
            key++) {
            /*
             * Is the entry an 'orphan'?
             */
            if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
                NULL) {
                /*
                 * If the value is not set then apparently
                 * it is not part of the current configuration,
                 * remove it this can happen when system panic
                 * between the primary name space update and
                 * the device id name space update
                 */
                if (md_nm_snarfed[key] == 0) {
                    if (md_verify_orphaned_record(setno,
                        key) == 1)
                        (void) remove_entry(nh,
                            side, key, 0L);
                }
            }
        }
    }

    if (md_nm_snarfed != NULL) {
        /*
         * Done and free the memory
         */
        kmem_free(md_nm_snarfed, size);
        md_nm_snarfed = NULL;
    }

    if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
        !(md_get_setstatus(setno) & MD_SET_STALE)) {
        /*
         * if the destroy flag has been set and
         * the MD_SET_DIDCLUP bit is not set in
         * the set's status field, cleanup the
         * entire device id namespace
         */
        if (md_devid_destroy &&
            !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
            (void) md_devid_cleanup(setno, 1);
            md_set_setstatus(setno, MD_SET_DIDCLUP);
        } else
            (void) md_devid_cleanup(setno, 0);
    }

    /*
     * clear single threading on snarf, return success or error
     */
out:
    md_haltsnarf_exit(setno);
    return (err);
}

/*
 * get_minfo:
 *  Fill in the media information for the metadevice 'mnum'.
 *
 *  All three fields are zeroed first.  If no incore unit exists for
 *  the minor they stay zero; otherwise the capacity is read from the
 *  unit structure under the unit reader lock, the block size is set
 *  to DEV_BSIZE and the media type to DK_UNKNOWN.
 */
void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
    mdi_unit_t  *ui;
    md_unit_t   *un;

    /* defaults reported for a non-existent unit */
    info->dki_capacity = 0;
    info->dki_lbsize = 0;
    info->dki_media_type = 0;

    ui = MDI_UNIT(mnum);
    if (ui != NULL) {
        /* un_total_blocks is only read while holding the lock */
        un = (md_unit_t *)md_unit_readerlock(ui);
        info->dki_capacity = un->c.un_total_blocks;
        md_unit_readerexit(ui);

        info->dki_lbsize = DEV_BSIZE;
        info->dki_media_type = DK_UNKNOWN;
    }
}


/*
 * get_info:
 *  Fill in the controller information structure for the metadevice
 *  'mnum'.  The controller fields describe the parent of md_devinfo,
 *  the unit fields describe the md driver itself; the remaining
 *  fields are reported as zero.
 */
void
get_info(struct dk_cinfo *info, minor_t mnum)
{
    /*
     * Names: controller name comes from the parent node, drive name
     * from the md driver.
     */
    (void) strcpy(info->dki_cname,
        ddi_get_name(ddi_get_parent(md_devinfo)));
    (void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));

    /*
     * Controller Information
     */
    info->dki_ctype = DKC_MD;
    info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));

    /*
     * Unit Information
     */
    info->dki_unit = mnum;
    info->dki_slave = 0;
    info->dki_partition = 0;
    info->dki_flags = 0;
    info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

    /*
     * We can't get from here to there yet
     */
    info->dki_addr = 0;
    info->dki_space = 0;
    info->dki_prio = 0;
    info->dki_vec = 0;
}

/*
 * mdadminopen:
 *  Open the admin device.  All bookkeeping is done under md_mx.
 *
 * Returns:
 *  0       - open counted, MD_GBL_OPEN set (and MD_GBL_EXCL when
 *            FEXCL was requested)
 *  EINVAL  - otyp is neither OTYP_CHR nor OTYP_LYR
 *  EBUSY   - the device is already open exclusively, or an exclusive
 *            open was requested while it is already open
 */
static int
mdadminopen(
    int flag,
    int otyp)
{
    int err = 0;

    /* single thread */
    mutex_enter(&md_mx);

    if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
        /* unsupported open type */
        err = EINVAL;
    } else if ((md_status & MD_GBL_EXCL) ||
        ((flag & FEXCL) && (md_status & MD_GBL_OPEN))) {
        /* exclusive access conflicts with an existing open */
        err = EBUSY;
    } else {
        /* count and flag open */
        md_ocnt[otyp]++;
        md_status |= MD_GBL_OPEN;
        if (flag & FEXCL)
            md_status |= MD_GBL_EXCL;
    }

    mutex_exit(&md_mx);
    return (err);
}

/*
 * open entry point
 *
 * Opens of the admin minor are handed to mdadminopen().  For any other
 * minor the open is validated while holding md_unit_array_rw.lock as
 * reader and then passed to the owning module's md_open routine, or,
 * if the module has none, counted here with md_unit_incopen().
 *
 * Returns 0 on success, otherwise:
 *  ENODEV  - the driver is halted, the local set could not be snarfed,
 *            or the target set is not snarfed
 *  ENXIO   - set/unit number out of range, no such unit, or the unit
 *            is marked MD_NOTOPENABLE
 *  EROFS   - write open of a stale set or of a unit that has a parent
 *  or whatever the module's md_open / md_unit_incopen() returned.
 */
static int
mdopen(
    dev_t       *dev,
    int     flag,
    int     otyp,
    cred_t      *cred_p)
{
    minor_t     mnum = getminor(*dev);
    unit_t      unit = MD_MIN2UNIT(mnum);
    set_t       setno = MD_MIN2SET(mnum);
    mdi_unit_t  *ui = NULL;
    int     err = 0;
    md_parent_t parent;

    /* dispatch admin device opens */
    if (mnum == MD_ADM_MINOR)
        return (mdadminopen(flag, otyp));

    /* lock, check status */
    rw_enter(&md_unit_array_rw.lock, RW_READER);

    /*
     * Everything from here on is re-run from scratch after waiting for
     * an open in progress (see below).
     */
tryagain:
    if (md_get_status() & MD_GBL_HALTED)  {
        err = ENODEV;
        goto out;
    }

    /* check minor */
    if ((setno >= md_nsets) || (unit >= md_nunits)) {
        err = ENXIO;
        goto out;
    }

    /* make sure we're snarfed */
    if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
        /* no error structure is supplied here, hence the NULL */
        if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
            err = ENODEV;
            goto out;
        }
    }
    /* only the local set is snarfed on demand; others must be ready */
    if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
        err = ENODEV;
        goto out;
    }

    /* check unit */
    if ((ui = MDI_UNIT(mnum)) == NULL) {
        err = ENXIO;
        goto out;
    }

    /*
     * The softpart open routine may do an I/O during the open, in
     * which case the open routine will set the OPENINPROGRESS flag
     * and drop all locks during the I/O.  If this thread sees
     * the OPENINPROGRESS flag set, it should wait until the flag
     * is reset before calling the driver's open routine.  It must
     * also revalidate the world after it grabs the unit_array lock
     * since the set may have been released or the metadevice cleared
     * during the sleep.
     */
    if (MD_MNSET_SETNO(setno)) {
        mutex_enter(&ui->ui_mx);
        if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
            /*
             * Drop the array lock for the duration of the wait,
             * take it back, then release the unit mutex and
             * redo all of the checks above.
             */
            rw_exit(&md_unit_array_rw.lock);
            cv_wait(&ui->ui_cv, &ui->ui_mx);
            rw_enter(&md_unit_array_rw.lock, RW_READER);
            mutex_exit(&ui->ui_mx);
            goto tryagain;
        }
        mutex_exit(&ui->ui_mx);
    }

    /* Test if device is openable */
    if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
        err = ENXIO;
        goto out;
    }

    /* don't allow opens w/WRITE flag if stale */
    if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
        err = EROFS;
        goto out;
    }

    /* don't allow writes to subdevices */
    parent = md_get_parent(md_expldev(*dev));
    if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
        err = EROFS;
        goto out;
    }

    /* open underlying driver */
    if (md_ops[ui->ui_opsindex]->md_open != NULL) {
        if ((err = (*md_ops[ui->ui_opsindex]->md_open)
            (dev, flag, otyp, cred_p, 0)) != 0)
            goto out;
    }

    /* or do it ourselves */
    else {
        /* single thread */
        (void) md_unit_openclose_enter(ui);
        err = md_unit_incopen(mnum, flag, otyp);
        md_unit_openclose_exit(ui);
        if (err != 0)
            goto out;
    }

    /* unlock, return status */
out:
    rw_exit(&md_unit_array_rw.lock);
    return (err);
}

/*
 * mdadminclose:
 *  Close the admin device.  All bookkeeping is done under md_mx.
 *
 *  A layered close (OTYP_LYR) drops one reference; a close of any
 *  other type drops all references of that type.  MD_GBL_OPEN is then
 *  recomputed from the per-type counts and, once nothing is open any
 *  more, MD_GBL_EXCL is cleared as well.
 *
 * Returns:
 *  0       - close accounted for
 *  EINVAL  - otyp is out of range
 *  ENXIO   - nothing of that type is open
 */
static int
mdadminclose(
    int otyp)
{
    int idx;
    int still_open = 0;
    int err = 0;

    /* single thread */
    mutex_enter(&md_mx);

    if ((otyp < 0) || (otyp >= OTYPCNT)) {
        err = EINVAL;
    } else if (md_ocnt[otyp] == 0) {
        err = ENXIO;
    } else {
        /* count the close */
        if (otyp == OTYP_LYR)
            md_ocnt[otyp]--;
        else
            md_ocnt[otyp] = 0;

        /* is any open type still referenced? */
        for (idx = 0; idx < OTYPCNT; ++idx) {
            if (md_ocnt[idx] != 0) {
                still_open = 1;
                break;
            }
        }

        if (still_open)
            md_status |= MD_GBL_OPEN;
        else
            md_status &= ~(MD_GBL_OPEN | MD_GBL_EXCL);
    }

    mutex_exit(&md_mx);
    return (err);
}

/*
 * close entry point
 *
 * Closes of the admin minor are handed to mdadminclose().  For any
 * other minor the close is passed to the owning module's md_close
 * routine, or, if the module has none, accounted for here with
 * md_unit_decopen().
 *
 * Returns 0 on success, ENXIO if the minor does not name an existing
 * unit, or the error from the module's md_close / md_unit_decopen().
 */
static int
mdclose(
    dev_t       dev,
    int     flag,
    int     otyp,
    cred_t      *cred_p)
{
    minor_t     mnum = getminor(dev);
    set_t       setno = MD_MIN2SET(mnum);
    unit_t      unit = MD_MIN2UNIT(mnum);
    mdi_unit_t  *ui;
    int     err;

    /* dispatch admin device closes */
    if (mnum == MD_ADM_MINOR)
        return (mdadminclose(otyp));

    /* check minor: range first, then existence of the unit */
    if ((setno >= md_nsets) || (unit >= md_nunits))
        return (ENXIO);
    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* close underlying driver */
    if (md_ops[ui->ui_opsindex]->md_close != NULL) {
        return ((*md_ops[ui->ui_opsindex]->md_close)
            (dev, flag, otyp, cred_p, 0));
    }

    /* or do it ourselves, single threaded */
    (void) md_unit_openclose_enter(ui);
    err = md_unit_decopen(mnum, otyp);
    md_unit_openclose_exit(ui);

    return (err);
}


/*
 * mdread:
 *  Raw read entry point, called from the device switch at normal
 *  priority.
 *
 *  If the module owning the unit provides an md_read routine the
 *  request is handed to it.  Otherwise the uio is checked with
 *  md_chk_uio() and the transfer is driven through physio() with
 *  mdstrategy.  A uio that spans two buffers which are contiguous on a
 *  single partition but not on a striped one is dealt with by
 *  mdstrategy.
 *
 * Returns ENXIO for the admin minor or a minor that does not name an
 * existing unit, otherwise the result of the routine that handled the
 * request.
 */
/*ARGSUSED*/
static int
mdread(dev_t dev, struct uio *uio, cred_t *credp)
{
    minor_t     mnum = getminor(dev);
    mdi_unit_t  *ui;
    int     error;

    /* the admin device cannot be read */
    if (mnum == MD_ADM_MINOR)
        return (ENXIO);

    /* validate set and unit numbers before looking up the unit */
    if ((MD_MIN2SET(mnum) >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits))
        return (ENXIO);

    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* module specific read, if there is one */
    if (md_ops[ui->ui_opsindex]->md_read != NULL) {
        return ((*md_ops[ui->ui_opsindex]->md_read)
            (dev, uio, credp));
    }

    /* generic path */
    error = md_chk_uio(uio);
    if (error != 0)
        return (error);

    return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
}

/*
 * mdaread:
 *  Asynchronous raw read entry point, called from the device switch
 *  at normal priority.
 *
 *  If the module owning the unit provides an md_aread routine the
 *  request is handed to it.  Otherwise the uio of the aio request is
 *  checked with md_chk_uio() and the transfer is driven through
 *  aphysio() with mdstrategy.  A request that spans two buffers which
 *  are contiguous on a single partition but not on a striped one is
 *  dealt with by mdstrategy.
 *
 * Returns ENXIO for the admin minor or a minor that does not name an
 * existing unit, otherwise the result of the routine that handled the
 * request.
 */
/*ARGSUSED*/
static int
mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
    minor_t     mnum = getminor(dev);
    mdi_unit_t  *ui;
    int     error;

    /* the admin device cannot be read */
    if (mnum == MD_ADM_MINOR)
        return (ENXIO);

    /* validate set and unit numbers before looking up the unit */
    if ((MD_MIN2SET(mnum) >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits))
        return (ENXIO);

    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* module specific async read, if there is one */
    if (md_ops[ui->ui_opsindex]->md_aread != NULL) {
        return ((*md_ops[ui->ui_opsindex]->md_aread)
            (dev, aio, credp));
    }

    /* generic path */
    error = md_chk_uio(aio->aio_uio);
    if (error != 0)
        return (error);

    return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
}

/*
 * mdwrite:
 *  Raw write entry point, called from the device switch at normal
 *  priority.
 *
 *  If the module owning the unit provides an md_write routine the
 *  request is handed to it.  Otherwise the uio is checked with
 *  md_chk_uio() and the transfer is driven through physio() with
 *  mdstrategy.  A uio that spans two buffers which are contiguous on a
 *  single partition but not on a striped one is dealt with by
 *  mdstrategy.
 *
 * Returns ENXIO for the admin minor or a minor that does not name an
 * existing unit, otherwise the result of the routine that handled the
 * request.
 */
/*ARGSUSED*/
static int
mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
{
    minor_t     mnum = getminor(dev);
    mdi_unit_t  *ui;
    int     error;

    /* the admin device cannot be written */
    if (mnum == MD_ADM_MINOR)
        return (ENXIO);

    /* validate set and unit numbers before looking up the unit */
    if ((MD_MIN2SET(mnum) >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits))
        return (ENXIO);

    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* module specific write, if there is one */
    if (md_ops[ui->ui_opsindex]->md_write != NULL) {
        return ((*md_ops[ui->ui_opsindex]->md_write)
            (dev, uio, credp));
    }

    /* generic path */
    error = md_chk_uio(uio);
    if (error != 0)
        return (error);

    return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
}

/*
 * mdawrite:
 *  Asynchronous raw write entry point, called from the device switch
 *  at normal priority.
 *
 *  If the module owning the unit provides an md_awrite routine the
 *  request is handed to it.  Otherwise the uio of the aio request is
 *  checked with md_chk_uio() and the transfer is driven through
 *  aphysio() with mdstrategy.  A request that spans two buffers which
 *  are contiguous on a single partition but not on a striped one is
 *  dealt with by mdstrategy.
 *
 * Returns ENXIO for the admin minor or a minor that does not name an
 * existing unit, otherwise the result of the routine that handled the
 * request.
 */
/*ARGSUSED*/
static int
mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
    minor_t     mnum = getminor(dev);
    mdi_unit_t  *ui;
    int     error;

    /* the admin device cannot be written */
    if (mnum == MD_ADM_MINOR)
        return (ENXIO);

    /* validate set and unit numbers before looking up the unit */
    if ((MD_MIN2SET(mnum) >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits))
        return (ENXIO);

    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* module specific async write, if there is one */
    if (md_ops[ui->ui_opsindex]->md_awrite != NULL) {
        return ((*md_ops[ui->ui_opsindex]->md_awrite)
            (dev, aio, credp));
    }

    /* generic path */
    error = md_chk_uio(aio->aio_uio);
    if (error != 0)
        return (error);

    return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
}

/*
 * mdstrategy:
 *  Strategy entry point.  Looks up the unit addressed by the buf and
 *  hands the buf to the owning module's md_strategy routine.
 *
 *  A buf addressed to the admin minor or to a minor that does not
 *  name an existing unit is completed here with ENXIO and a residual
 *  of the full byte count.  If the module has no strategy routine the
 *  buf is failed with ENXIO through errdone().
 *
 * Always returns 0; errors are reported through the buf.
 */
int
mdstrategy(struct buf *bp)
{
    minor_t     mnum;
    mdi_unit_t  *ui = NULL;

    ASSERT((bp->b_flags & B_DONE) == 0);

    /* once the system is panicking, mark the daemons as not live */
    if (panicstr)
        md_clr_status(MD_GBL_DAEMONS_LIVE);

    /* only look the unit up when the minor is plausible */
    mnum = getminor(bp->b_edev);
    if ((mnum != MD_ADM_MINOR) &&
        (MD_MIN2SET(mnum) < md_nsets) &&
        (MD_MIN2UNIT(mnum) < md_nunits))
        ui = MDI_UNIT(mnum);

    if (ui == NULL) {
        bp->b_flags |= B_ERROR;
        bp->b_error = ENXIO;
        bp->b_resid = bp->b_bcount;
        biodone(bp);
        return (0);
    }

    bp->b_flags &= ~(B_ERROR | B_DONE);
    if (md_ops[ui->ui_opsindex]->md_strategy == NULL)
        (void) errdone(ui, bp, ENXIO);
    else
        (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);

    return (0);
}

/*
 * is_mt_ioctl:
 *  Return true if the ioctl is allowed to be multithreaded.
 *
 *  All the ioctls with MN are sent only from the message handlers
 *  through rpc.mdcommd, which (via its own locking mechanism) takes
 *  care that no two ioctls for the same metadevice are issued at the
 *  same time, so they are safe here.  The other ioctls listed do not
 *  touch any metadevice structures and are therefore harmless too if
 *  called several times at the same time.
 */
static boolean_t
is_mt_ioctl(int cmd)
{
    boolean_t   mt_safe;

    switch (cmd) {
    case MD_IOCGUNIQMSGID:
    case MD_IOCGVERSION:
    case MD_IOCISOPEN:
    case MD_MN_SET_MM_OWNER:
    case MD_MN_SET_STATE:
    case MD_MN_SUSPEND_WRITES:
    case MD_MN_ALLOCATE_HOTSPARE:
    case MD_MN_SET_SETFLAGS:
    case MD_MN_GET_SETFLAGS:
    case MD_MN_MDDB_OPTRECFIX:
    case MD_MN_MDDB_PARSE:
    case MD_MN_MDDB_BLOCK:
    case MD_MN_DB_USERREQ:
    case MD_IOC_SPSTATUS:
    case MD_MN_COMMD_ERR:
    case MD_MN_SET_COMMD_RUNNING:
    case MD_MN_RESYNC:
    case MD_MN_SETSYNC:
    case MD_MN_POKE_HOTSPARES:
        mt_safe = 1;
        break;
    default:
        mt_safe = 0;
        break;
    }

    return (mt_safe);
}

/*
 * mdioctl:
 *  Ioctl entry point for the Virtual Disk System, called from the
 *  device switch at normal priority.
 *
 *  Ioctls on the admin minor go to md_admin_ioctl(); ioctls on a
 *  metadevice go to the owning module's md_ioctl routine.
 *
 *  For multinode disksets a number of ioctls are allowed to be
 *  multithreaded (see is_mt_ioctl()).  A fundamental assumption made
 *  in this implementation is that such ioctls either do not interact
 *  with other md structures, or the ioctl to the admin device can
 *  only occur while the metadevice is open, i.e. there is no race
 *  between metaclear and the progress of a multithreaded ioctl.
 *
 * Returns:
 *  EINTR   - md_ioctl_lock_enter() returned EINTR
 *  ENXIO   - driver halted / daemons dying, or no such unit
 *  ENOTTY  - the owning module has no ioctl routine
 *  otherwise the value produced by IOLOCK_RETURN_IOCTLEND() for the
 *  handler's result.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
    int *rval_p)
{
    minor_t     mnum = getminor(dev);
    int     mt_ioctl = is_mt_ioctl(cmd);
    mdi_unit_t  *ui;
    IOLOCK      lock;
    int     err;

    /* everything that is not multithreaded takes the ioctl lock */
    if (!mt_ioctl) {
        if (md_ioctl_lock_enter() == EINTR)
            return (EINTR);
    }

    /*
     * initialize lock tracker
     */
    IOLOCK_INIT(&lock);

    if (mt_ioctl) {
        /*
         * MD_GBL_IOCTL_LOCK is not acquired for these; count the
         * ioctl and flag the lock tracker instead.
         */
        mutex_enter(&md_mx);
        md_mtioctl_cnt++;
        mutex_exit(&md_mx);
        lock.l_flags |= MD_MT_IOCTL;
    }

    /*
     * this has been added to prevent notification from re-snarfing
     * so metaunload will work.  It may interfere with other modules
     * halt process.
     */
    if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
        return (IOLOCK_RETURN(ENXIO, &lock));

    if (mnum == MD_ADM_MINOR) {
        /* admin device ioctls */
        err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
            mode, &lock);
    } else if ((MD_MIN2SET(mnum) >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits)) {
        /* metadevice ioctl on an out of range minor */
        err = ENXIO;
    } else if ((ui = MDI_UNIT(mnum)) == NULL) {
        /* metadevice ioctl on a unit that does not exist */
        err = ENXIO;
    } else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
        err = ENOTTY;
    } else {
        err = (*md_ops[ui->ui_opsindex]->md_ioctl)
            (dev, cmd, (void *) data, mode, &lock);
    }

    /*
     * drop any locks we grabbed
     */
    return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

/*
 * mddump:
 *  Dump entry point.  Hands the request to the md_dump routine of the
 *  module owning the unit.
 *
 * Returns ENXIO when the minor is the admin minor, does not name an
 * existing unit, belongs to a set that is not snarfed, or the module
 * has no dump routine; otherwise the module's result.
 */
static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
    minor_t     mnum = getminor(dev);
    set_t       setno;
    mdi_unit_t  *ui;

    if (mnum == MD_ADM_MINOR)
        return (ENXIO);

    /* validate set and unit numbers before looking up the unit */
    setno = MD_MIN2SET(mnum);
    if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits))
        return (ENXIO);

    if ((ui = MDI_UNIT(mnum)) == NULL)
        return (ENXIO);

    /* the set must have been snarfed */
    if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
        return (ENXIO);

    if (md_ops[ui->ui_opsindex]->md_dump == NULL)
        return (ENXIO);

    return ((*md_ops[ui->ui_opsindex]->md_dump)
        (dev, addr, blkno, nblk));
}

/*
 * md_get_nextunit:
 *  Metadevice unit number dispatcher.
 *
 *  Scans the incore unit array of the set, starting at s_un_next and
 *  wrapping around at MD_MAXUNITS, for the first empty slot.  When one
 *  is found, s_un_next is advanced to the slot after it so that the
 *  next call starts there.  The scan is done under md_mx.
 *
 * Returns the free unit number, or MD_UNITBAD if the set reports no
 * available units or the scan finds no empty slot.
 */
unit_t
md_get_nextunit(set_t setno)
{
    unit_t  first, cur, following;
    unit_t  found = MD_UNITBAD;

    /*
     * If nothing available
     */
    if (md_set[setno].s_un_avail == 0)
        return (MD_UNITBAD);

    mutex_enter(&md_mx);
    first = cur = md_set[setno].s_un_next;

    do {
        /* index that follows 'cur', wrapping to 0 at the end */
        following = (cur == MD_MAXUNITS - 1) ? 0 : cur + 1;

        if (md_set[setno].s_un[cur] == NULL) {
            /*
             * Advance the starting index for the next
             * md_get_nextunit call
             */
            md_set[setno].s_un_next = following;
            found = cur;
            break;
        }

        cur = following;
    } while (cur != first);

    mutex_exit(&md_mx);
    return (found);
}