lvm/md/md_mddb.c

	md_mddb.c revision 32c22d57860198538fb6b8f261cb76ab26318d34
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/systeminfo.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/debug.h>
#include <sys/stat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_crc.h>
#include <sys/lvm/md_convert.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/cladm.h>

/*
 * File-scope default mhd_mhiargs_t value (not static, so it is visible to
 * other files).
 * NOTE(review): presumably the default multihost-disk timing arguments; the
 * meaning and units of each initializer are defined by mhd_mhiargs_t, which
 * is not visible here -- confirm against that type's declaration.
 */
mhd_mhiargs_t   defmhiargs = {
    1000,
    { 6000, 6000, 30000 }
};

#define MDDB

#include <sys/lvm/mdvar.h>
#include <sys/lvm/mdmed.h>
#include <sys/lvm/md_names.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

/* Defined outside this file. */
extern char svm_bootpath[];

/*
 * File-scope limits with compile-time defaults.
 * NOTE(review): these look like tunables (one is explicitly commented as
 * such); how each is consumed is not visible in this part of the file.
 */
int         md_maxbootlist = MAXBOOTLIST;
static ulong_t      mddb_maxblocks = 0; /* tune for small records */
static int      mddb_maxbufheaders = 50;
static uint_t       mddb_maxcopies = MDDB_NLB;

/*
 * If this is set, more detailed messages about DB init will be given, instead
 * of just the MDE_DB_NODB.  Defaults to off (0).
 */
static int      mddb_db_err_detail = 0;

/*
 * This lock is used to single-thread load/unload of all sets.
 */
static kmutex_t     mddb_lock;

/*
 * You really do NOT want to change this boolean.
 * It can be VERY dangerous to do so.  Loss of
 * data may occur. USE AT YOUR OWN RISK!!!!
 * Defaults to off (0).
 */
static int      mddb_allow_half = 0;
/*
 * For a mirrored root, allow reboot with only half the replicas available.
 * Flag inserted for the Santa Fe project.  Not static, so it is visible to
 * other files; it has no explicit initializer here.
 */
int mirrored_root_flag;

/*
 * Character classification helpers.  Both macros evaluate their argument
 * more than once, so do not pass an expression with side effects.
 *  ISWHITE - true for space, tab, carriage return or newline
 *  ISNUM   - true for the characters '0' through '9'
 */
#define ISWHITE(c)  (((c) == ' ') || ((c) == '\t') || \
                ((c) == '\r') || ((c) == '\n'))
#define ISNUM(c)    (((c) >= '0') && ((c) <= '9'))

/* Address of the s_dbmx mutex of the md_set[] entry for the given set. */
#define SETMUTEX(setno) (&md_set[setno].s_dbmx)

/*
 * Objects defined outside this file.  The trailing comments name the
 * defining file where the original authors recorded it.
 */
extern md_krwlock_t md_unit_array_rw;   /* md.c */
extern set_t        md_nsets;       /* md.c */
extern int      md_nmedh;       /* md.c */
extern md_set_t     md_set[];       /* md.c */
/* Optional strategy test hook; callers in this file check it for NULL. */
extern int      (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t   *md_devinfo;
extern int      md_init_debug;
extern int      md_status;
extern md_ops_t     *md_opslist;
extern md_krwlock_t nm_lock;

/* Forward declaration; the definition is not in this part of the file. */
static int      update_locatorblock(mddb_set_t *s, md_dev64_t dev,
                ddi_devid_t didptr, ddi_devid_t old_didptr);

/*
 * Defines for crc calculation for records.  Both macros are thin wrappers
 * around rec_crcfunc(), selecting its mode with the last argument:
 *  rec_crcgen generates a crc checksum for a record block; the value
 *      returned by rec_crcfunc() is discarded
 *  rec_crcchk checks the crc checksum for a record block and yields the
 *      value returned by rec_crcfunc()
 */
#define REC_CRCGEN  0
#define REC_CRCCHK  1
#define rec_crcgen(s, dep, rbp) \
    (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define rec_crcchk(s, dep, rbp) \
    rec_crcfunc(s, dep, rbp, REC_CRCCHK)

/*
 * During upgrade, SVM basically runs with the devt from the target
 * being upgraded.  Translations are made from the target devt to the
 * miniroot devt when writing data out to the disk.  This is done by
 * the following routines:
 *  wrtblklst
 *  writeblks
 *  readblklst
 *  readblks
 *  dt_read
 *
 * The following routines are used by the routines listed above and
 * expect a translated (aka miniroot) devt:
 *  getblks
 *  getmasters
 *
 * Also, when calling any system routines, such as ddi_lyr_get_devid,
 * the translated (aka miniroot) devt must be used.
 *
 * By the same token, the major number and major name conversion operations
 * need to use the name_to_major file from the target system instead
 * of the name_to_major file on the miniroot.  So, calls to
 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
 * when running on an upgrade.  Same is true with calls to
 * ddi_major_to_name.
 */


#ifndef MDDB_FAKE

/*
 * mddb_rwdata
 *
 * Issue the I/O described by bp and, unless B_ASYNC was requested, wait
 * for it to finish.
 *
 * The set mutex is released before the I/O is started and re-acquired
 * before returning, so the caller is expected to hold it on entry and
 * holds it again on return.
 *
 * If a strategy test point is installed it is called first; the buffer is
 * only passed to bdev_strategy() when there is no test point or the test
 * point returns 0.
 *
 * Returns 0 for a B_ASYNC request, otherwise the result of biowait().
 */
static int
mddb_rwdata(
    mddb_set_t  *s, /* incore db set structure */
    int     flag,   /* B_ASYNC, B_FAILFAST or 0 passed in here */
    buf_t       *bp
)
{
    int     rval;

    /* B_ASYNC is stripped from b_flags; it only decides whether we wait */
    bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);

    mutex_exit(SETMUTEX(s->s_setno));

    if (mdv_strategy_tstpnt == NULL) {
        (void) bdev_strategy(bp);
    } else if ((*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) {
        (void) bdev_strategy(bp);
    }

    rval = (flag & B_ASYNC) ? 0 : biowait(bp);

    mutex_enter(SETMUTEX(s->s_setno));
    return (rval);
}

/*
 * setidentifier
 *
 * Copy the set's identifier into *ident.  The local set is identified by
 * its serial string; every other set by its creation time.
 */
static void
setidentifier(
    mddb_set_t  *s,
    identifier_t    *ident
)
{
    if (s->s_setno != MD_LOCAL_SET) {
        ident->createtime = s->s_ident.createtime;
        return;
    }

    (void) strcpy(ident->serial, s->s_ident.serial);
}

/*
 * cmpidentifier
 *
 * Compare *ident with the set's own identifier, using the serial string
 * for the local set and the creation time for every other set.
 *
 * Returns 0 when the identifiers match and non-zero otherwise (the
 * strcmp() result for the local set, the timercmp() "!=" result for the
 * others).
 */
static int
cmpidentifier(
    mddb_set_t  *s,
    identifier_t    *ident
)
{
    int     differ;

    if (s->s_setno == MD_LOCAL_SET) {
        differ = strcmp(ident->serial, s->s_ident.serial);
    } else {
        differ = timercmp(&ident->createtime,
            /*CSTYLED*/
            &s->s_ident.createtime, !=);
    }
    return (differ);
}

/*
 * mddb_devopen
 *
 * Layered open (OTYP_LYR) of the device for reading and writing, using
 * kernel credentials.
 *
 * Returns 0 if dev_lopen() succeeded and 1 if it failed; the underlying
 * error code is not passed back.
 */
static int
mddb_devopen(
    md_dev64_t  dev
)
{
    dev_t       ddi_dev = md_dev64_to_dev(dev);
    int     rv;

    rv = dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred);
    return ((rv == 0) ? 0 : 1);
}

/*
 * mddb_devclose
 *
 * Layered close (OTYP_LYR) matching mddb_devopen().  The result of
 * dev_lclose() is deliberately ignored.
 */
static void
mddb_devclose(
    md_dev64_t  dev
)
{
    (void) dev_lclose(md_dev64_to_dev(dev), FREAD | FWRITE, OTYP_LYR,
        kcred);
}

/*
 * stripe_skip_ts
 *
 * Build the list of areas to leave out of the checksum of a stripe
 * record: one entry per component, covering the un_mirror.ms_timestamp
 * field of that component.
 *
 * un points at the record data and is interpreted as ms_unit32_od_t for
 * the MDDB_REV_RB/MDDB_REV_RBFN revisions and as ms_unit_t for the
 * MDDB_REV_RB64/MDDB_REV_RB64FN revisions.  Offsets in the list are
 * relative to the start of the record block, hence the rb_data offset
 * added to un_ocomp.
 *
 * Returns the head of a kmem_zalloc'ed list that the caller must free, or
 * NULL if the unit has no rows or components or the revision is not one
 * of the four handled here.
 */
static crc_skip_t *
stripe_skip_ts(void *un, uint_t revision)
{
    ms_unit32_od_t      *un32;
    ms_unit_t       *un64;
    struct ms_row32_od  *rows32;
    struct ms_row       *rows64;
    crc_skip_t      *head = NULL;
    crc_skip_t      **linkp = &head;
    crc_skip_t      *ent;
    size_t          compsize = 0;   /* size of one component */
    size_t          tsoff = 0;  /* timestamp offset in comp */
    uint_t          ncomps = 0;
    uint_t          compoff = 0;
    uint_t          row, comp;
    uint_t          rb_off = offsetof(mddb_rb32_t, rb_data[0]);

    /*
     * Work out, per on-disk format, how many components there are,
     * where the component array starts and how it is laid out.
     */
    switch (revision) {
    case MDDB_REV_RB:
    case MDDB_REV_RBFN:
        un32 = (ms_unit32_od_t *)un;
        if (un32->un_nrows == 0)
            return (NULL);

        rows32 = &un32->un_row[0];
        for (row = 0; row < un32->un_nrows; row++)
            ncomps += rows32[row].un_ncomp;

        compoff = un32->un_ocomp + rb_off;
        compsize = sizeof (ms_comp32_od_t);
        tsoff = offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
        break;
    case MDDB_REV_RB64:
    case MDDB_REV_RB64FN:
        un64 = (ms_unit_t *)un;
        if (un64->un_nrows == 0)
            return (NULL);

        rows64 = &un64->un_row[0];
        for (row = 0; row < un64->un_nrows; row++)
            ncomps += rows64[row].un_ncomp;

        compoff = un64->un_ocomp + rb_off;
        compsize = sizeof (ms_comp_t);
        tsoff = offsetof(ms_comp_t, un_mirror.ms_timestamp);
        break;
    default:
        /* unknown revision: ncomps stays 0, so nothing is skipped */
        break;
    }

    /* One skip entry per component, appended in component order */
    for (comp = 0; comp < ncomps; ++comp) {
        uint_t  mdcp = compoff + (comp * compsize);

        ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
        ent->skip_offset = mdcp + tsoff;
        ent->skip_size = sizeof (md_timeval32_t);
        *linkp = ent;
        linkp = &ent->skip_next;
    }

    return (head);
}

/*
 * mirror_skip_ts
 *
 * Build the list of areas to leave out of the checksum of a mirror
 * record: un_last_read first, followed by sm_timestamp for each of the
 * NMIRROR submirror slots.
 *
 * The field offsets are taken from mm_unit32_od_t for the
 * MDDB_REV_RB/MDDB_REV_RBFN revisions and from mm_unit_t for the
 * MDDB_REV_RB64/MDDB_REV_RB64FN revisions, and are made relative to the
 * start of the record block.  For any other revision the entries are
 * still created but their offsets are left at 0.
 *
 * Returns the head of a kmem_zalloc'ed list of 1 + NMIRROR entries that
 * the caller must free.
 */
static crc_skip_t *
mirror_skip_ts(uint_t revision)
{
    int     i;
    int     rev32, rev64;
    crc_skip_t  *head = NULL;
    crc_skip_t  **linkp = &head;
    crc_skip_t  *ent;
    uint_t      rb_off = offsetof(mddb_rb32_t, rb_data[0]);

    rev32 = (revision == MDDB_REV_RB || revision == MDDB_REV_RBFN);
    rev64 = (revision == MDDB_REV_RB64 || revision == MDDB_REV_RB64FN);

    /* un_last_read */
    ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    if (rev32) {
        ent->skip_offset = offsetof(mm_unit32_od_t,
            un_last_read) + rb_off;
    } else if (rev64) {
        ent->skip_offset = offsetof(mm_unit_t,
            un_last_read) + rb_off;
    }
    ent->skip_size = sizeof (int);
    *linkp = ent;
    linkp = &ent->skip_next;

    /* sm_timestamp of every submirror slot */
    for (i = 0; i < NMIRROR; i++) {
        ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
        if (rev32) {
            ent->skip_offset = offsetof(mm_unit32_od_t,
                un_sm[i].sm_timestamp) + rb_off;
        } else if (rev64) {
            ent->skip_offset = offsetof(mm_unit_t,
                un_sm[i].sm_timestamp) + rb_off;
        }
        ent->skip_size = sizeof (md_timeval32_t);
        *linkp = ent;
        linkp = &ent->skip_next;
    }

    return (head);
}

/*
 * hotspare_skip_ts
 *
 * Build the (single entry) list of areas to leave out of the checksum of
 * a hotspare record: the hs_timestamp field.
 *
 * The offset is taken from hot_spare32_od_t for the
 * MDDB_REV_RB/MDDB_REV_RBFN revisions and from hot_spare_t for the
 * MDDB_REV_RB64/MDDB_REV_RB64FN revisions, relative to the start of the
 * record block.  For any other revision the offset is left at 0.
 *
 * Returns a kmem_zalloc'ed entry that the caller must free.
 */
static crc_skip_t *
hotspare_skip_ts(uint_t revision)
{
    crc_skip_t  *ent;
    uint_t      rb_off = offsetof(mddb_rb32_t, rb_data[0]);

    ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);

    if (revision == MDDB_REV_RB || revision == MDDB_REV_RBFN) {
        ent->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
            rb_off;
    } else if (revision == MDDB_REV_RB64 ||
        revision == MDDB_REV_RB64FN) {
        ent->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
            rb_off;
    }
    ent->skip_size = sizeof (md_timeval32_t);

    return (ent);
}

/*
 * rec_crcfunc
 *
 * Generate (check == 0) or verify (check != 0) the checksum of a record
 * block.  Normally reached through the rec_crcgen()/rec_crcchk() macros.
 *
 * A record block may be written by different nodes in a multi-owner
 * diskset (in case of master change), so for such sets the timestamp
 * fields are excluded from the crc computation.  Otherwise the timestamps
 * would give each node a different checksum for the same record block,
 * and the exclusive-or of all record block checksums and data block
 * record sums would become non-zero once a new master has written at
 * least one record block.
 *
 * Returns the crcchk() result when checking, and the freshly generated
 * rb_checksum when generating.
 */
static uint_t
rec_crcfunc(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep,
    mddb_rb32_t *rbp,
    int     check
)
{
    mddb_type_t type = dep->de_type1;
    crc_skip_t  *head;
    crc_skip_t  *tstamp;
    crc_skip_t  *modskip;
    crc_skip_t  *next;
    uint_t      sum;

    /*
     * The skip list always starts with the three uint_t words beginning
     * at rb_checksum_fiddle (rb_checksum, rb_private and rb_userdata
     * according to the original authors).
     */
    head = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    head->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
    head->skip_size = 3 * sizeof (uint_t);

    if (MD_MNSET_SETNO(s->s_setno)) {
        /* For a MN set, skip rb_timestamp as well */
        tstamp = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
        tstamp->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
        tstamp->skip_size = sizeof (md_timeval32_t);
        head->skip_next = tstamp;

        /* ... followed by the timestamps inside the record data */
        modskip = NULL;
        if (type >= MDDB_FIRST_MODID) {
            switch (dep->de_flags) {
            case MDDB_F_STRIPE:
                modskip = stripe_skip_ts((void *)rbp->rb_data,
                    rbp->rb_revision);
                break;
            case MDDB_F_MIRROR:
                modskip = mirror_skip_ts(rbp->rb_revision);
                break;
            case MDDB_F_HOTSPARE:
                modskip = hotspare_skip_ts(rbp->rb_revision);
                break;
            default:
                break;
            }
        }
        tstamp->skip_next = modskip;
    }

    if (check) {
        sum = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, head);
    } else {
        crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, head);
        sum = rbp->rb_checksum;
    }

    /* The skip list is only needed for the duration of this call */
    for (; head != NULL; head = next) {
        next = head->skip_next;
        kmem_free(head, sizeof (crc_skip_t));
    }
    return (sum);
}

/*
 * allocbuffer
 *
 * Take a buffer header off the set's free list and return it zeroed, with
 * its embedded buf linked to itself and flagged B_BUSY.
 *
 * If the free list is empty and sleepflag is MDDB_NOSLEEP, NULL is
 * returned.  Otherwise the caller waits on s_buf_cv (with the set mutex,
 * which cv_wait() requires to be held) until freebuffer() returns a
 * buffer.  Each wait is counted in s_bufmisses.
 */
static mddb_bf_t *
allocbuffer(
    mddb_set_t  *s,
    int     sleepflag
)
{
    mddb_bf_t   *bfp;

    for (;;) {
        bfp = s->s_freebufhead;
        if (bfp != NULL)
            break;

        if (sleepflag == MDDB_NOSLEEP)
            return ((mddb_bf_t *)NULL);

        ++s->s_bufmisses;
#ifdef  DEBUG
        if (s->s_bufmisses == 1)
            cmn_err(CE_NOTE,
                "md: mddb: set %u sleeping for buffer", s->s_setno);
#endif
        /* ask freebuffer() to wake us up */
        s->s_bufwakeup = 1;
        cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
    }

    /* unlink before clearing, since bzero wipes bf_next too */
    s->s_freebufhead = bfp->bf_next;
    bzero((caddr_t)bfp, sizeof (*bfp));
    bfp->bf_buf.b_forw = &bfp->bf_buf;
    bfp->bf_buf.b_back = &bfp->bf_buf;
    bfp->bf_buf.b_flags = B_BUSY;   /* initialize flags */
    return (bfp);
}

/*
 * freebuffer
 *
 * Put a buffer header back at the front of the set's free list and, if
 * allocbuffer() recorded that somebody is waiting, wake all waiters and
 * clear the wakeup request.
 */
static void
freebuffer(
    mddb_set_t      *s,
    mddb_bf_t   *bfp
)
{
    bfp->bf_next = s->s_freebufhead;
    s->s_freebufhead = bfp;

    if (!s->s_bufwakeup)
        return;

    cv_broadcast(&s->s_buf_cv);
    s->s_bufwakeup = 0;
}


/*
 * blkbusy
 *
 * Mark logical block blk as in use: set its bit in the set's bitmap
 * (one bit per block, eight blocks per byte) and decrement the free block
 * count.  The block must not already be marked busy (ASSERTed).
 */
static void
blkbusy(
    mddb_set_t  *s,
    mddb_block_t    blk
)
{
    int     idx = blk / 8;
    int     mask = 1 << (blk & 7);

    s->s_freeblkcnt--;
    ASSERT(! (s->s_freebitmap[idx] & mask));
    s->s_freebitmap[idx] |= mask;
}

/*
 * blkfree
 *
 * Mark logical block blk as free: clear its bit in the set's bitmap and
 * increment the free block count.  The block must currently be marked
 * busy (ASSERTed).
 */
static void
blkfree(
    mddb_set_t  *s,
    mddb_block_t    blk
)
{
    int     idx = blk / 8;
    int     mask = 1 << (blk & 7);

    s->s_freeblkcnt++;
    ASSERT(s->s_freebitmap[idx] & mask);
    s->s_freebitmap[idx] &= ~mask;
}

/*
 * blkcheck
 *
 * Test whether logical block blk is marked busy in the set's bitmap.
 * Returns non-zero (the masked bit, not necessarily 1) if the block is
 * busy and 0 if it is free.
 */
static int
blkcheck(
    mddb_set_t  *s,
    mddb_block_t    blk
)
{
    int     idx = blk / 8;
    int     mask = 1 << (blk & 7);

    return (s->s_freebitmap[idx] & mask);
}

/*
 * getfreeblks
 *
 * First-fit search (not fast but simple) for count contiguous free
 * logical blocks among the first s_totalblkcnt blocks.  When a run is
 * found, every block in it is marked busy and the number of the first
 * block is returned.
 *
 * Returns 0 if no sufficiently long run of free blocks exists.
 */
static mddb_block_t
getfreeblks(
    mddb_set_t  *s,
    size_t      count
)
{
    int     blk;
    size_t      run = 0;    /* length of the current free run */
    size_t      first;

    for (blk = 0; blk < s->s_totalblkcnt; blk++) {
        if (blkcheck(s, blk)) {
            /* busy block ends the current run */
            run = 0;
            continue;
        }
        if (++run != count)
            continue;

        /* blk is the last block of the run; claim all of it */
        first = blk - count + 1;
        for (blk = (int)first; blk < first + count; blk++)
            blkbusy(s, blk);
        return ((mddb_block_t)first);
    }
    return (0);
}

/*
 * computefreeblks
 *
 * Rebuild the set's free block accounting from the incore structures:
 *  - find the highest logical block number referenced by any directory
 *    block or directory entry,
 *  - for every locator that is neither deleted nor flagged
 *    MDDB_F_EMASTER, add up the block counts of its master blocks; a
 *    replica whose total is not larger than the highest block in use is
 *    flagged MDDB_F_TOOSMALL and loses MDDB_F_ACTIVE,
 *  - set s_totalblkcnt and s_freeblkcnt to the smallest replica total
 *    (100 if no replica contributed one), capped at the per-set-type
 *    maximum,
 *  - (re)initialize the bitmap and mark every block that is in use as
 *    busy.  Each blkbusy() call also decrements s_freeblkcnt, so on
 *    return s_freeblkcnt is the total minus the blocks marked here.
 *
 * The bitmap is allocated on the first call only (sized for the maximum
 * block count, one bit per block) and is cleared on every call.
 */
static void
computefreeblks(
    mddb_set_t  *s
)
{
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    int     i;
    int     minblks;
    int     freeblks;
    mddb_mb_ic_t    *mbip;
    mddb_lb_t   *lbp;
    mddb_block_t    maxblk;
    mddb_did_db_t   *did_dbp;
    int     nblks;

    minblks = 0;
    lbp = s->s_lbp;
    maxblk = 0;

    /*
     * Determine the max number of blocks; multi-node sets use a
     * different limit than traditional sets.
     */
    nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
    /*
     * go through and find highest logical block
     */
    for (dbp = s->s_dbp; dbp != 0;  dbp = dbp->db_next) {
        if (dbp->db_blknum > maxblk)
            maxblk = dbp->db_blknum;
        for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
            for (i = 0; i < dep->de_blkcount; i++)
                if (dep->de_blks[i] > maxblk)
                    maxblk = dep->de_blks[i];
    }

    /*
     * Size each usable replica (despite its name, freeblks holds the
     * replica's total block count) and remember the smallest one.
     */
    for (i = 0; i < lbp->lb_loccnt; i++) {
        mddb_locator_t  *lp = &lbp->lb_locators[i];

        if ((lp->l_flags & MDDB_F_DELETED) ||
            (lp->l_flags & MDDB_F_EMASTER))
            continue;

        freeblks = 0;
        for (mbip = s->s_mbiarray[i]; mbip != NULL;
            mbip = mbip->mbi_next) {
            freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
        }
        if (freeblks == 0)  /* this happens when there is no */
            continue;   /*  master blk      */

        /* replica cannot hold the highest block already in use */
        if (freeblks <= maxblk) {
            lp->l_flags |= MDDB_F_TOOSMALL;
            lp->l_flags &= ~MDDB_F_ACTIVE;
        }

        if (freeblks < minblks || minblks == 0)
            minblks = freeblks;
    }
    /*
     * set up reasonable freespace if no
     * data bases exist
     */
    if (minblks == 0)
        minblks = 100;
    if (minblks > nblks)
        minblks = nblks;
    s->s_freeblkcnt = minblks;
    s->s_totalblkcnt = minblks;
    /* allocate the bitmap once, sized for the maximum block count */
    if (! s->s_freebitmapsize) {
        s->s_freebitmapsize = nblks / 8;
        s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
            KM_SLEEP);
    }
    /* start from "everything free" before marking the blocks in use */
    bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);

    /* locator block sectors */
    for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
        blkbusy(s, i);

    /* locator name sectors */
    for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
        blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));

    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        /* locator block device id information */
        for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
            blkbusy(s, (s->s_lbp->lb_didfirstblk + i));

        /* disk blocks containing actual device ids */
        did_dbp = s->s_did_icp->did_ic_dbp;
        while (did_dbp) {
            for (i = 0; i < did_dbp->db_blkcnt; i++) {
                blkbusy(s, did_dbp->db_firstblk + i);
            }
            did_dbp = did_dbp->db_next;
        }
    }

    /* Only use data tags if not a MN set */
    if (!(lbp->lb_flags & MDDB_MNSET)) {
        /* Found a bad tag, do NOT mark the data tag blks busy here */
        if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
            for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
                blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
        }
    }

    /* directory block/entry sectors */
    for (dbp = s->s_dbp; dbp != 0;  dbp = dbp->db_next) {
        blkbusy(s, dbp->db_blknum);
        for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
            for (i = 0; i < dep->de_blkcount; i++)
                blkbusy(s, dep->de_blks[i]);
    }
}

/*
 * Add free space to the device id incore free list.
 * Called:
 *    - During startup when all devid blocks are temporarily placed on the
 *       free list
 *    - After a devid has been deleted via the metadb command.
 *    - When mddb_devid_free_get adds unused space from a disk block
 *       to free list
 *
 * The new <firstblk, offset, length> entry is pushed onto the front of
 * the list; no attempt is made to merge it with adjacent entries.
 * Nothing is done for a set that is not device id style.
 *
 * Always returns 0.
 */
static int
mddb_devid_free_add(
    mddb_set_t *s,
    uint_t firstblk,
    uint_t offset,
    uint_t length
)
{
    mddb_did_free_t *newp;

    if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
        newp = kmem_zalloc(sizeof (mddb_did_free_t), KM_SLEEP);
        newp->free_blk = firstblk;
        newp->free_offset = offset;
        newp->free_length = length;

        /* push onto the head of the free list */
        newp->free_next = s->s_did_icp->did_ic_freep;
        s->s_did_icp->did_ic_freep = newp;
    }

    return (0);
}

/*
 * Remove specific free space from the device id incore free list.
 * Called at startup (after all devid blocks have been placed on
 * free list) in order to remove the free space from the list that
 * contains actual devids.
 *
 * The first free list entry that fully contains <firstblk, offset, length>
 * is split: the part of the entry before the area and the part after it
 * stay on the free list (each only if its length is non-zero), and the
 * area itself is no longer free.
 *
 * Returns 0 if area successfully removed.
 * Returns 1 if no matching area is found - so nothing removed - or if
 * the set is not device id style.
 */
static int
mddb_devid_free_delete(
    mddb_set_t *s,
    uint_t firstblk,
    uint_t offset,
    uint_t length
)
{
    int     block_found = 0;
    mddb_did_free_t *did_freep1;        /* next free block */
    mddb_did_free_t *did_freep2 = 0;    /* previous free block */
    mddb_did_free_t *did_freep_before;  /* area before offset, len */
    mddb_did_free_t *did_freep_after;   /* area after offset, len */
    uint_t      old_length;

    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
        return (1);
    }

    /* find free block for this devid */
    did_freep1 = s->s_did_icp->did_ic_freep;
    while (did_freep1) {
        /*
         * Look through free list of <block, offset, length> to
         * find our entry in the free list.  Our entry should
         * exist since the entire devid block was placed into
         * this free list at startup.  This code is just removing
         * the non-free (in-use) portions of the devid block so
         * that the remaining linked list does indeed just
         * contain a free list.
         *
         * Our entry has been found if
         *   - the blocks match,
         *   - the offset (starting address) in the free list is
         *  less than or equal to the offset of our entry and
         *   - the length+offset (ending address) in the free list is
         *  greater than or equal to the length+offset of our entry.
         */
        if ((did_freep1->free_blk == firstblk) &&
            (did_freep1->free_offset <= offset) &&
            ((did_freep1->free_length + did_freep1->free_offset) >=
            (length + offset))) {
            /* Have found our entry - remove from list */
            block_found = 1;
            /* the matching entry is reused for the "before" area */
            did_freep_before = did_freep1;
            old_length = did_freep1->free_length;
            /* did_freep1 - pts to next free block */
            did_freep1 = did_freep1->free_next;
            if (did_freep2) {
                did_freep2->free_next = did_freep1;
            } else {
                s->s_did_icp->did_ic_freep = did_freep1;
            }

            /*
             * did_freep_before points to area in block before
             * offset, length.
             */
            did_freep_before->free_length = offset -
                did_freep_before->free_offset;
            /*
             * did_freep_after points to area in block after
             * offset, length.  Its length is whatever is left of
             * the old entry once the area being removed and the
             * "before" area have been subtracted.
             */
            did_freep_after = (mddb_did_free_t *)kmem_zalloc
                (sizeof (mddb_did_free_t), KM_SLEEP);
            did_freep_after->free_blk = did_freep_before->free_blk;
            did_freep_after->free_offset = offset + length;
            did_freep_after->free_length = old_length - length -
                did_freep_before->free_length;
            /*
             * Add before and after areas to free list
             * If area before or after offset, length has length
             * of 0, that entry is not added.
             * The "after" area is linked in first, so that the
             * "before" area ends up ahead of it in the list.
             */
            if (did_freep_after->free_length) {
                did_freep_after->free_next = did_freep1;
                if (did_freep2) {
                    did_freep2->free_next =
                        did_freep_after;
                } else {
                    s->s_did_icp->did_ic_freep =
                        did_freep_after;
                }
                did_freep1 = did_freep_after;
            } else {
                kmem_free(did_freep_after,
                    sizeof (mddb_did_free_t));
            }

            if (did_freep_before->free_length) {
                did_freep_before->free_next = did_freep1;
                if (did_freep2) {
                    did_freep2->free_next =
                        did_freep_before;
                } else {
                    s->s_did_icp->did_ic_freep =
                        did_freep_before;
                }
            } else {
                kmem_free(did_freep_before,
                    sizeof (mddb_did_free_t));
            }
            /* only the first matching entry is processed */
            break;
        } else {
            did_freep2 = did_freep1;
            did_freep1 = did_freep1->free_next;
        }
    }
    if (block_found == 0) {
        return (1);
    } else {
        return (0);
    }
}

/*
 * Find free space of devid length and remove free space from list.
 * Return a pointer to the previously free area.
 *
 * If there's not enough free space on the free list, get an empty
 * disk block, put the empty disk block on the did_ic_dbp linked list,
 * and add the disk block space not used for devid to the free list.
 *
 * On success the location of the area is passed back through the out
 * parameters:
 *  *blk    - first logical block of the disk block holding the area
 *  *cnt    - number of logical blocks in that disk block
 *  *offset - byte offset of the area within the disk block
 * The out parameters may have been written even when 0 is returned, so
 * they are only meaningful on success.
 *
 * Return pointer to address (inside disk block) of free area for devid.
 * Return 0 if error (set is not device id style, or no free logical
 * blocks are available for a new disk block).
 */
static caddr_t
mddb_devid_free_get(
    mddb_set_t *s,
    uint_t len,
    uint_t *blk,
    uint_t *cnt,
    uint_t *offset
)
{
    mddb_did_free_t *freep, *freep2;
    mddb_did_db_t   *dbp;
    uint_t      blk_cnt, blk_num;
    ddi_devid_t devid_ptr = NULL;
    int     found = 0;

    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
        return (0);
    }

    /* freep2 trails freep so that an exhausted entry can be unlinked */
    freep = s->s_did_icp->did_ic_freep;
    freep2 = (mddb_did_free_t *)NULL;
    while (freep) {
        /* found a free area - remove from free list */
        if (len <= freep->free_length) {
            *blk = freep->free_blk;
            *offset = freep->free_offset;
            /* find disk block pointer that contains free area */
            dbp = s->s_did_icp->did_ic_dbp;
            while (dbp) {
                if (dbp->db_firstblk == *blk)
                    break;
                else
                    dbp = dbp->db_next;
            }
            /*
             * If a disk block pointer can't be found - something
             * is wrong, so don't use this free space.
             */
            if (dbp == NULL) {
                freep2 = freep;
                freep = freep->free_next;
                continue;
            }

            devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
            *cnt = dbp->db_blkcnt;

            /*
             * Record success explicitly.  The entry may be freed
             * below, after which the value of freep must not be
             * relied upon to tell whether space was found.
             */
            found = 1;

            /* Update free list information */
            freep->free_offset += len;
            freep->free_length -= len;
            if (freep->free_length == 0) {
                if (freep2) {
                    freep2->free_next =
                        freep->free_next;
                } else {
                    s->s_did_icp->did_ic_freep =
                        freep->free_next;
                }
                kmem_free(freep, sizeof (mddb_did_free_t));
            }
            break;
        }
        freep2 = freep;
        freep = freep->free_next;
    }

    /* Didn't find a free spot */
    if (!found) {
        /* get free logical disk blk in replica */
        blk_cnt = btodb(len + (MDDB_BSIZE - 1));
        blk_num = getfreeblks(s, blk_cnt);
        if (blk_num == 0)
            return (0);

        /* Add disk block to disk block linked list */
        dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
        dbp->db_firstblk = blk_num;
        dbp->db_blkcnt = blk_cnt;
        dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
        dbp->db_next = s->s_did_icp->did_ic_dbp;
        s->s_did_icp->did_ic_dbp = dbp;
        devid_ptr = (ddi_devid_t)dbp->db_ptr;

        /* Update return values */
        *blk = blk_num;
        *offset = 0;
        *cnt = blk_cnt;

        /* Add unused part of block to free list */
        (void) mddb_devid_free_add(s, blk_num,
            len, (dbtob(blk_cnt) - len));
    }

    return ((caddr_t)devid_ptr);
}

/*
 * Add device id information for locator index to device id area in set.
 * Get free area to store device id from free list.   Update checksum
 * for mddb_did_blk.
 *
 * This routine does not write any data out to disk.
 * After this routine has been called, the routine, writelocall, should
 * be called to write both the locator block and device id area out
 * to disk.
 *
 * Returns 0 if the device id was added or already existed for this index.
 * Returns 1 if the set is not device id style, if minor_name does not fit
 * in MDDB_MINOR_NAME_MAX bytes including its terminator, or if no space
 * could be obtained for the device id.
 */
static int
mddb_devid_add(
    mddb_set_t  *s,
    uint_t      index,
    ddi_devid_t devid,
    char        *minor_name
)
{
    mddb_did_blk_t  *did_blk;
    mddb_did_info_t *did_info;
    ddi_devid_t devid_ptr;
    char        *src, *dst;
    uint_t      devid_len;
    uint_t      blk, blkcnt, offset;
    uint_t      i;

    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
        return (1);
    if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
        return (1);

    /* Nothing to do if this index already has a device id */
    did_blk = s->s_did_icp->did_ic_blkp;
    did_info = &(did_blk->blk_info[index]);
    if (did_info->info_flags & MDDB_DID_EXISTS)
        return (0);

    /* Reserve room for the device id in the devid disk blocks */
    devid_len = ddi_devid_sizeof(devid);
    devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
        devid_len, &blk, &blkcnt, &offset);
    if (devid_ptr == NULL)
        return (1);

    /* Copy devid into devid free area, byte by byte */
    src = (char *)devid;
    dst = (char *)devid_ptr;
    for (i = 0; i < devid_len; i++)
        dst[i] = src[i];

    /* Update mddb_did_info area for new device id */
    did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;

    /*
     * Only set UPDATED flag for non-replicated import cases.
     * This allows the side locator driver name index to get
     * updated in load_old_replicas.
     */
    if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
        did_info->info_flags |= MDDB_DID_UPDATED;

    did_info->info_firstblk = blk;
    did_info->info_blkcnt = blkcnt;
    did_info->info_offset = offset;
    did_info->info_length = devid_len;
    (void) strcpy(did_info->info_minor_name, minor_name);
    crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);

    /* Add device id pointer to did_ic_devid array */
    s->s_did_icp->did_ic_devid[index] = devid_ptr;

    return (0);
}


/*
 * Delete device id information for locator index from device id area in set.
 * Add device id space to free area.
 *
 * This routine does not write any data out to disk.
 * After this routine has been called, the routine, writelocall, should
 * be called to write both the locator block and device id area out
 * to disk.
 *
 * Returns 0 if the device id was deleted.
 * Returns 1 if the set is not device id style or if no device id exists
 * for this index.
 */
static int
mddb_devid_delete(mddb_set_t *s, uint_t index)
{
    mddb_did_info_t *did_info;

    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
        return (1);

    /* Get device id information from mddb_did_blk */
    did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);

    /*
     * Ensure that the underlying device supports device ids
     * before arbitrarily removing them.
     */
    if (!(did_info->info_flags & MDDB_DID_EXISTS))
        return (1);

    /* Forget the device id, both in mddb_did_blk and incore */
    did_info->info_flags = 0;
    s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;

    /*
     * Hand the space back to the free list.  Only info_flags was
     * cleared above, so the location fields are still valid here.
     */
    (void) mddb_devid_free_add(s, did_info->info_firstblk,
        did_info->info_offset, did_info->info_length);

    return (0);
}

/*
 * Check if there is a device id for a locator index.
 *
 * Returns 1 and fills in *devid and *minor_name if a device id exists for
 * the index.  Returns 0, leaving both untouched, if none exists or the
 * set is not device id style.
 *
 * Caller of this routine should not free devid or minor_name since
 * these will point to internal data structures that should not
 * be freed.
 */
static int
mddb_devid_get(
    mddb_set_t *s,
    uint_t index,
    ddi_devid_t *devid,
    char **minor_name
)
{
    mddb_did_info_t *did_info;

    if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
        return (0);

    did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
    if (!(did_info->info_flags & MDDB_DID_EXISTS))
        return (0);

    *devid = s->s_did_icp->did_ic_devid[index];
    *minor_name = did_info->info_minor_name;
    return (1);
}

/*
 * Decide whether a device id resolves to a device on the running system.
 * Takes the devid, a pointer to the previously known dev and the current
 * minor_name.
 *
 * Returns 0 when the devid/minor_name pair maps to at least one device.
 * If none of the devices returned matches *dev, *dev is replaced by the
 * first device in the returned list.
 *
 * Returns 1 when dev is a null pointer or the device id cannot be
 * resolved on this system.
 */
static int
mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
{
    dev_t       *devlist;
    int     ndevs;
    int     i;

    if (dev == NULL)
        return (1);

    if (ddi_lyr_devid_to_devlist(devid, minor_name,
        &ndevs, &devlist) != DDI_SUCCESS)
        return (1);

    /* The list is only released when it holds at least one entry */
    if (ndevs <= 0)
        return (1);

    /* Does the caller's dev appear among the devices for this devid? */
    for (i = 0; i < ndevs; i++) {
        if (*dev == md_expldev(devlist[i]))
            break;
    }

    /* Not found: switch the caller over to the first device listed */
    if (i == ndevs)
        *dev = md_expldev(devlist[0]);

    ddi_lyr_free_devlist(devlist, ndevs);
    return (0);
}


/*
 * Release every incore structure hanging off a device id control block:
 * the device id block itself (sized from lbp->lb_didblkcnt), the list of
 * device id data blocks, the free-space list, and finally the control
 * block.  *did_icp is reset to NULL.  A NULL *did_icp is a no-op.
 */
static void
mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
{
    mddb_did_ic_t   *icp = *did_icp;
    mddb_did_db_t   *dbp, *dbnext;
    mddb_did_free_t *freep, *freenext;

    if (icp == NULL)
        return;

    if (icp->did_ic_blkp != NULL) {
        kmem_free((caddr_t)icp->did_ic_blkp,
            dbtob(lbp->lb_didblkcnt));
        icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
    }

    /* Each list node owns a separately allocated data buffer */
    for (dbp = icp->did_ic_dbp; dbp != NULL; dbp = dbnext) {
        dbnext = dbp->db_next;
        kmem_free((caddr_t)dbp->db_ptr, dbtob(dbp->db_blkcnt));
        kmem_free((caddr_t)dbp, sizeof (mddb_did_db_t));
    }

    for (freep = icp->did_ic_freep; freep != NULL; freep = freenext) {
        freenext = freep->free_next;
        kmem_free((caddr_t)freep, sizeof (mddb_did_free_t));
    }

    kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
    *did_icp = (mddb_did_ic_t *)NULL;
}

/*
 * Translate a logical database block number into a physical block number
 * by walking the chain of incore master blocks.  Each master block
 * accounts for mb_blkcnt logical blocks.
 *
 * Returns (daddr_t)-1 when the chain ends before the block is reached or
 * when the block lies beyond the consecutive run described by the
 * master block's block map.
 */
static daddr_t
getphysblk(
    mddb_block_t        blk,
    mddb_mb_ic_t        *mbip
)
{
    mddb_mb_t   *mbp;

    for (;;) {
        mbp = &mbip->mbi_mddb_mb;
        if (blk < mbp->mb_blkcnt)
            break;
        if (mbip->mbi_next == NULL)
            return ((daddr_t)-1);   /* no such block */
        /* Skip over the blocks covered by this master block */
        blk -= mbp->mb_blkcnt;
        mbip = mbip->mbi_next;
    }

    if (blk < mbp->mb_blkmap.m_consecutive)
        return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));

    return ((daddr_t)-1);   /* no such block */
}

/*
 * putblks - write `cnt' blocks from `buffer' to physical block `blk'
 * of `device'.
 *
 * With a non-null bufhead the write is issued asynchronously and the
 * buffer header is pushed onto the FRONT of the *bufhead chain
 * (writerec depends on that ordering); 0 is always returned in that
 * case and the header is not freed here.
 *
 * Otherwise the write is synchronous: the header is released, and on
 * failure a replica-errored event is posted and MDDB_F_EWRITE returned.
 */
static int
putblks(
    mddb_set_t  *s,     /* incore db set structure */
    caddr_t     buffer,     /* adr of buffer to be written */
    daddr_t     blk,        /* block number for first block */
    int     cnt,        /* number of blocks to be written */
    md_dev64_t  device,     /* device to be written to */
    mddb_bf_t   **bufhead   /* if non-zero then ASYNC I/O */
                    /*    and put buf address here */
)
{
    mddb_bf_t   *bfp = allocbuffer(s, MDDB_SLEEPOK);
    buf_t       *bp = &bfp->bf_buf;
    int     err;

    bp->b_bcount = MDDB_BSIZE * cnt;
    bp->b_un.b_addr = buffer;
    bp->b_blkno = blk;
    bp->b_edev = md_dev64_to_dev(device);

    /*
     * A buf chain header means async i/o; at present that is only
     * done for optimized records.
     */
    if (bufhead != NULL) {
        bfp->bf_next = *bufhead;
        *bufhead = bfp;
        (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
        return (0);
    }

    err = mddb_rwdata(s, B_WRITE, bp);
    freebuffer(s, bfp);
    if (err == 0)
        return (0);

    SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
        s->s_setno, device);
    return (MDDB_F_EWRITE);
}

/*
 * wrtblklst - scatter-write a buffer to the logical blocks listed
 *      in blka[].  Runs of logical blocks that map to adjacent
 *      physical blocks are combined into a single putblks call.
 * If called during upgrade, this routine expects a
 * non-translated (aka target) dev.
 *
 * Returns 0 on success (including the case where a non-master node of
 * a MN diskset is not permitted to write), 1 if the device cannot be
 * translated, or the error returned by putblks.
 */
static int
wrtblklst(
    mddb_set_t  *s,     /* incore set structure */
    caddr_t     buffer,     /* buffer to be written (record blk) */
    mddb_block_t    blka[],     /* list of logical blks for record */
    daddr_t     cnt,        /* number of logical blks */
    const int   li,     /* locator index */
    mddb_bf_t   **bufhead,  /* if non-zero then ASYNC I/O */
                    /*    and put buf address here */
    int     master_only /* allow only master node to write */
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_locator_t  *lp = &lbp->lb_locators[li];
    mddb_mb_ic_t    *mbip = s->s_mbiarray[li];
    md_dev64_t  dev;
    daddr_t     runstart;
    daddr_t     nextblk;
    int     runlen;
    int     err = 0;

    if (lbp->lb_flags & MDDB_MNSET) {
        /*
         * In a MN diskset where only the master may write, every
         * other node simply reports success.
         */
        if ((master_only == MDDB_WR_ONLY_MASTER) &&
            !(md_set[s->s_setno].s_am_i_master))
            return (0);
        if (mbip == NULL)
            return (MDDB_F_EWRITE);
    }

    dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
    if (dev == NODEV64)
        return (1);

    runstart = getphysblk(blka[0], mbip);
    ASSERT(runstart >= 0);

    while (cnt) {
        /* Grow the run while the next block is physically adjacent */
        for (runlen = 1; runlen != cnt; runlen++) {
            nextblk = getphysblk(blka[runlen], mbip);
            ASSERT(nextblk >= 0);
            if ((runstart + runlen) != nextblk)
                break;
        }

        err = putblks(s, buffer, runstart, runlen, dev, bufhead);
        if (err) {
            /*
             * For a MN diskset with any_node_can_write the
             * request originates from writeoptrecord and
             * l_flags must be left alone: it gets updated
             * when a class1 message is sent to the master,
             * and touching it here would leave the slave
             * out of sync with the master.
             *
             * In every other case (not a MN diskset, or
             * only_master_can_write) record the error in
             * l_flags.
             */
            if (!(lbp->lb_flags & MDDB_MNSET) ||
                (master_only == MDDB_WR_ONLY_MASTER))
                lp->l_flags |= MDDB_F_EWRITE;
            return (err);
        }

        /* putblks left the new header at the front of the chain */
        if (bufhead)
            (*bufhead)->bf_locator = lp;

        buffer += MDDB_BSIZE * runlen;
        blka += runlen;
        cnt -= runlen;
        if (cnt) {
            runstart = getphysblk(blka[0], mbip);
            ASSERT(runstart >= 0);
        }
    }

    return (0);
}

/*
 * writeblks - takes a logical block number/block count pair
 *      and writes the buffer to those contiguous logical blocks.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Multi-block requests are expanded into a block list and handed to
 * wrtblklst; a single block is written directly with putblks.
 *
 * Returns 0 on success (including the case where a non-master node of a
 * MN diskset is not permitted to write), 1 if the device cannot be
 * translated, or the error from the underlying write.  A failed
 * single-block write also sets MDDB_F_EWRITE in the locator's l_flags.
 */
static int
writeblks(
    mddb_set_t  *s,     /* incore set structure */
    caddr_t     buffer,     /* buffer to be written */
    mddb_block_t    blk,        /* starting logical block number */
    int     cnt,        /* number of log blocks to be written */
    const int   li,     /* locator index */
    int     master_only /* allow only master node to write */
)
{
    daddr_t     physblk;
    int     err = 0;
    int     i;
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_locator_t  *lp = &lbp->lb_locators[li];
    md_dev64_t  dev;
    mddb_block_t    *blkarray;
    int     size;
    int     ret;

    /*
     * If a MN diskset and only the master can write,
     * then a non-master node will just return success.
     */
    if ((lbp->lb_flags & MDDB_MNSET) &&
        (master_only == MDDB_WR_ONLY_MASTER)) {
        /* return successfully if we aren't the master */
        if (!(md_set[s->s_setno].s_am_i_master)) {
            return (0);
        }
    }

    dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
    if (dev == NODEV64) {
        return (1);
    }

    if (cnt > 1) {
        size = sizeof (mddb_block_t) * cnt;
        blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
        for (i = 0; i < cnt; i++)
            blkarray[i] = blk + i;
        /*
         * Hand the caller's master_only policy through unchanged.
         * Forcing MDDB_WR_ONLY_MASTER here would make wrtblklst
         * return success without writing anything on a non-master
         * node even though the caller allowed any node to write,
         * which disagrees with the single-block path below.
         */
        ret = wrtblklst(s, buffer, blkarray, cnt,
            li, 0, master_only);
        kmem_free(blkarray, size);
        return (ret);
    }
    physblk = getphysblk(blk, s->s_mbiarray[li]);
    ASSERT(physblk > 0);
    if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
        lp->l_flags |= MDDB_F_EWRITE;
        return (err);
    }
    return (0);
}

/*
 * writeall - push the buffer out to every replica that is marked
 * active and has not recorded a write error.  The individual
 * writeblks results are OR-ed together and returned.
 */
static int
writeall(
    mddb_set_t  *s,     /* incore set structure */
    caddr_t     buffer,     /* buffer to be written */
    mddb_block_t    block,      /* starting logical block number */
    int     cnt,        /* number of log blocks to be written */
    int     master_only /* allow only master node to write */
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    int     result = 0;
    int     li;

    for (li = 0; li < lbp->lb_loccnt; li++) {
        mddb_locator_t  *lp = &lbp->lb_locators[li];

        if ((lp->l_flags & MDDB_F_ACTIVE) &&
            !(lp->l_flags & MDDB_F_EWRITE)) {
            result |= writeblks(s, buffer, block, cnt, li,
                master_only);
        }
    }

    return (result);
}

/*
 * writelocall - write the locator block, and the device id information
 * when the replica is in device id format, to every ACTIVE/NON-ERRORED
 * replica.
 *
 * The locator block's commitcnt is bumped first; for device id style
 * replicas the device id area's commitcnt is set to the same value.
 * Checksums are regenerated after the commitcnt update(s).
 *
 * Returns the OR of all the writeblks results.
 */
static int
writelocall(
    mddb_set_t  *s  /* incore set structure */
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_did_blk_t  *did_blk;
    mddb_did_db_t   *did_dbp;
    int     result = 0;
    int     li;

    s->s_lbp->lb_commitcnt++;
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        did_blk = s->s_did_icp->did_ic_blkp;
        did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
        crcgen(did_blk, &did_blk->blk_checksum,
            dbtob(lbp->lb_didblkcnt), NULL);
    }
    crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);

    for (li = 0; li < lbp->lb_loccnt; li++) {
        mddb_locator_t  *lp = &lbp->lb_locators[li];

        /* Skip replicas that are inactive or already write-errored */
        if (!(lp->l_flags & MDDB_F_ACTIVE))
            continue;
        if (lp->l_flags & MDDB_F_EWRITE)
            continue;

        if (lbp->lb_flags & MDDB_DEVID_STYLE) {
            /* first the blocks holding the device ids themselves */
            for (did_dbp = s->s_did_icp->did_ic_dbp;
                did_dbp != NULL; did_dbp = did_dbp->db_next) {
                result |= writeblks(s,
                    (caddr_t)did_dbp->db_ptr,
                    did_dbp->db_firstblk,
                    did_dbp->db_blkcnt, li,
                    MDDB_WR_ONLY_MASTER);
            }

            /* then the device id area block */
            result |= writeblks(s, (caddr_t)did_blk,
                lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
                MDDB_WR_ONLY_MASTER);
        }

        /* and finally the locator block */
        result |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
            MDDB_WR_ONLY_MASTER);
    }

    /*
     * On the master node of a MN diskset, flag in the mddb_set
     * structure that the locator block has changed by setting
     * MDDB_PARSE_LOCBLK.
     */
    if ((lbp->lb_flags & MDDB_MNSET) &&
        (md_set[s->s_setno].s_am_i_master))
        s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;

    return (result);
}

/*
 * getblks - synchronously read `cnt' blocks starting at physical block
 * `blk' of `device' into `buffer'.
 * If called during upgrade, this routine expects a translated
 * (aka miniroot) dev.
 *
 * Returns 0 on success.  On failure a replica-errored event is posted
 * and MDDB_F_EREAD is returned.
 */
static int
getblks(
    mddb_set_t  *s, /* incore db set structure */
    caddr_t     buffer, /* buffer to read data into */
    md_dev64_t  device, /* device to read from */
    daddr_t     blk,    /* physical block number to read */
    int     cnt,    /* number of blocks to read */
    int     flag    /* flags for I/O */
)
{
    /* per the original author, this allocation will never sleep */
    mddb_bf_t   *bfp = allocbuffer(s, MDDB_SLEEPOK);
    buf_t       *bp = &bfp->bf_buf;
    int     err;

    bp->b_un.b_addr = buffer;
    bp->b_bcount = MDDB_BSIZE * cnt;
    bp->b_blkno = blk;
    bp->b_edev = md_dev64_to_dev(device);

    err = mddb_rwdata(s, (B_READ | flag), bp);
    freebuffer(s, bfp);
    if (err == 0)
        return (0);

    SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
        s->s_setno, device);
    return (MDDB_F_EREAD);
}

/*
 * readblklst - gather-read the logical blocks listed in blka[] into
 *      the buffer.  Runs of logical blocks that map to adjacent
 *      physical blocks are fetched with a single getblks call.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Returns 0 on success, 1 if the device cannot be translated, or the
 * error returned by getblks.
 */
static int
readblklst(
    mddb_set_t  *s, /* incore set structure */
    caddr_t     buffer, /* buffer to be read (record block) */
    mddb_block_t    blka[], /* list of logical blocks to be read */
    daddr_t     cnt,    /* number of logical blocks */
    int     li, /* locator index */
    int     flag    /* flags for I/O */
)
{
    mddb_mb_ic_t    *mbip = s->s_mbiarray[li];
    md_dev64_t  dev;
    daddr_t     runstart;
    daddr_t     nextblk;
    int     runlen;
    int     err = 0;

    dev = md_xlate_targ_2_mini(
        md_expldev(s->s_lbp->lb_locators[li].l_dev));
    if (dev == NODEV64)
        return (1);

    runstart = getphysblk(blka[0], mbip);
    ASSERT(runstart >= 0);

    while (cnt) {
        /* Grow the run while the next block is physically adjacent */
        for (runlen = 1; runlen != cnt; runlen++) {
            nextblk = getphysblk(blka[runlen], mbip);
            ASSERT(nextblk >= 0);
            if ((runstart + runlen) != nextblk)
                break;
        }

        err = getblks(s, buffer, dev, runstart, runlen, flag);
        if (err)
            return (err);

        buffer += MDDB_BSIZE * runlen;
        blka += runlen;
        cnt -= runlen;
        if (cnt) {
            runstart = getphysblk(blka[0], mbip);
            ASSERT(runstart >= 0);
        }
    }
    return (0);
}

/*
 * readblks - read `cnt' contiguous logical blocks, starting at logical
 *      block `blk', into the buffer.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Requests for more than one block are expanded into a block list and
 * passed to readblklst; a single block is read directly via getblks.
 */
static int
readblks(
    mddb_set_t  *s, /* incore set structure */
    caddr_t     buffer, /* buffer to be read into */
    mddb_block_t    blk,    /* logical block number to be read */
    int     cnt,    /* number of logical blocks to be read */
    int     li  /* locator index */
)
{
    md_dev64_t  device;
    daddr_t     physblk;

    if (cnt > 1) {
        mddb_block_t    *blklist;
        int     nbytes;
        int     rval;
        int     i;

        nbytes = sizeof (mddb_block_t) * cnt;
        blklist = (mddb_block_t *)kmem_alloc(nbytes, KM_SLEEP);
        for (i = 0; i < cnt; i++)
            blklist[i] = blk + i;
        rval = readblklst(s, buffer, blklist, cnt, li, 0);
        kmem_free(blklist, nbytes);
        return (rval);
    }

    physblk = getphysblk(blk, s->s_mbiarray[li]);
    ASSERT(physblk > 0);

    device = md_xlate_targ_2_mini(
        md_expldev(s->s_lbp->lb_locators[li].l_dev));
    if (device == NODEV64)
        return (1);

    return (getblks(s, buffer, device, physblk, 1, 0));
}

/*
 * Acquire the set's single-thread lock.  While another thread has it,
 * register interest in s_singlelockwanted and wait on
 * s_single_thread_cv using the set's mutex (SETMUTEX) -- presumably the
 * caller already holds that mutex; confirm against callers.
 */
static void
single_thread_start(
    mddb_set_t  *s
)
{
    for (;;) {
        if (!s->s_singlelockgotten)
            break;
        s->s_singlelockwanted++;
        cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
    }
    s->s_singlelockgotten++;
}

/*
 * Release the set's single-thread lock and, if any thread recorded that
 * it wants the lock, clear the wanted count and wake all waiters.
 */
static void
single_thread_end(
    mddb_set_t  *s
)
{
    ASSERT(s->s_singlelockgotten);
    s->s_singlelockgotten = 0;

    if (!s->s_singlelockwanted)
        return;

    s->s_singlelockwanted = 0;
    cv_broadcast(&s->s_single_thread_cv);
}

/*
 * Size in bytes of an incore directory entry: the fixed structure,
 * less one mddb_block_t, plus one mddb_block_t per block
 * (de_blkcount) the entry describes.
 */
static size_t
sizeofde(
    mddb_de_ic_t    *dep
)
{
    return (sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
        sizeof (mddb_block_t) * dep->de_blkcount);
}

/*
 * Size in bytes of a 32-bit format directory entry: the structure
 * without its de32_blks member, plus one mddb_block_t for each of the
 * entry's de32_blkcount blocks.
 */
static size_t
sizeofde32(
    mddb_de32_t *dep
)
{
    return (sizeof (*dep) - sizeof (dep->de32_blks) +
        sizeof (mddb_block_t) * dep->de32_blkcount);
}

/*
 * Return the address immediately following a 32-bit format directory
 * entry, i.e. where the next packed entry would begin.
 */
static mddb_de32_t *
nextentry(
    mddb_de32_t *dep
)
{
    caddr_t     after = (caddr_t)dep + sizeofde32(dep);

    return ((mddb_de32_t *)((void *)after));
}

/*
 * create_db32rec - build the 32-bit format image of a directory block.
 *
 * db32p:   output buffer for the 32-bit directory block; readcopy uses an
 *          MDDB_BSIZE buffer for the same structure, and the closing
 *          ASSERT assumes that size here.
 * dbp:     incore directory block whose entry list is to be packed.
 *
 * The header is converted with dbtodb32 and each incore entry is then
 * converted with detode32 into consecutive packed entries that start
 * right after the db32_firstentry field.
 */
static void
create_db32rec(
    mddb_db32_t *db32p,
    mddb_db_t *dbp
)
{
    mddb_de_ic_t *dep;
    mddb_de32_t *de32p;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

    dbtodb32(dbp, db32p);
    /*
     * The 32-bit link fields act only as "another entry follows"
     * markers (readcopy tests them for non-zero).  If the incore
     * pointer is non-null but converted to zero, force a non-zero
     * marker so that the entry is not lost.
     */
    if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
        db32p->db32_firstentry = 0x4;
    de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
        + sizeof (db32p->db32_firstentry)));
    for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
        detode32(dep, de32p);
        if ((dep->de_next != NULL) && (de32p->de32_next == 0))
            de32p->de32_next = 0x4;
        de32p = nextentry(de32p);
    }
    /*
     * The packed entries must not run past the end of the directory
     * block.  The bound is taken from the start of the block (db32p);
     * comparing de32p against itself could never fail.
     */
    ASSERT((uintptr_t)de32p <= (uintptr_t)db32p + MDDB_BSIZE);
}

/*
 * getmasters - open a replica device, read the master block at `blkno'
 * and validate it.
 *
 * If called during upgrade, this routine expects a translated
 * (aka miniroot) dev.
 * If master blocks are found, set the mn_set parameter to 1 if the
 * master block revision number is MDDB_REV_MNMB; otherwise,
 * set it to 0.
 * If master blocks are not found, do not change the mnset parameter.
 *
 * Returns a newly allocated incore master block (MDDB_IC_BSIZE bytes)
 * on success, with the device left open.  On failure returns NULL,
 * ORs the error bits into *flag when flag is non-null, and (except
 * when the open itself failed) frees the buffer and closes the device.
 */
static mddb_mb_ic_t *
getmasters(
    mddb_set_t  *s,
    md_dev64_t  dev,
    daddr_t     blkno,
    uint_t      *flag,
    int     *mn_set
)
{
    mddb_mb_ic_t    *mbi = NULL;
    mddb_mb_t   *mb;
    int     error = 0;
    ddi_devid_t devid;


    /* Nothing has been allocated yet, so an open failure returns directly */
    if (mddb_devopen(dev)) {
        if (flag)
            *flag |= MDDB_F_EMASTER;
        return ((mddb_mb_ic_t *)NULL);
    }


    mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
    mb = &(mbi->mbi_mddb_mb);
    if (error = getblks(s, (caddr_t)mb, dev, blkno,
        btodb(MDDB_BSIZE), 0)) {
        error |= MDDB_F_EMASTER;
    }
    /*
     * NOTE(review): the validation checks below run even after a read
     * error, and each one assigns to `error' rather than OR-ing into
     * it, so a failing check replaces any bits set by getblks above.
     */
    if (mb->mb_magic != MDDB_MAGIC_MB) {
        error = MDDB_F_EFMT | MDDB_F_EMASTER;
    }
    /* Check for MDDB_REV_MNMB and lower */
    if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
        error = MDDB_F_EFMT | MDDB_F_EMASTER;
    }
    if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
        error = MDDB_F_EFMT | MDDB_F_EMASTER;
    }

    /* The set number is only enforced when the set is not being imported */
    if (!(md_get_setstatus(s->s_setno) &
        (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
        (mb->mb_setno != s->s_setno)) {
        error = MDDB_F_EFMT | MDDB_F_EMASTER;
    }
    /* The block must record the same block number it was read from */
    if (mb->mb_blkno != blkno) {
        error = MDDB_F_EFMT | MDDB_F_EMASTER;
    }
    /* Incore links start out empty regardless of what was on disk */
    mb->mb_next = NULL;
    mbi->mbi_next = NULL;

    if (error)
        goto out;

    /*
     * Check the md_devid_destroy and md_keep_repl_state flags
     * to see if we need to regen the devid or not.
     *
     * Don't care about devid in local set since it is not used
     * and this should not be part of set importing
     */
    if ((s->s_setno != MD_LOCAL_SET) &&
        !(md_get_setstatus(s->s_setno) &
        (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
        /*
         * Now check the destroy flag. We also need to handle
         * the case where the destroy flag is reset after the
         * destroy
         */
        if (md_devid_destroy || (mb->mb_devid_len == 0)) {

            /* Explicit destroy: wipe the stored devid first */
            if (md_devid_destroy) {
                bzero(mb->mb_devid, mb->mb_devid_len);
                mb->mb_devid_len = 0;
            }

            /*
             * Try to regenerate it if the 'keep' flag is not set
             */
            if (!md_keep_repl_state) {
                if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
                    &devid) == DDI_SUCCESS) {
                    mb->mb_devid_len =
                        ddi_devid_sizeof(devid);
                    bcopy(devid, mb->mb_devid,
                        mb->mb_devid_len);
                    ddi_devid_free(devid);
                } else {
                    error = MDDB_F_EFMT | MDDB_F_EMASTER;
                }
            }

            /* The devid fields changed, so the checksum is stale */
            crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);

            /*
             * Push the updated master block back to disk.  Note that
             * this write happens even when the devid regeneration
             * above failed.
             */
            if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
                error = MDDB_F_EFMT | MDDB_F_EMASTER;
            }
        }
    }

    if (! error) {
        /* Set mn_set parameter to 1 if a MN set */
        if (mb->mb_revision == MDDB_REV_MNMB)
            *mn_set = 1;
        else
            *mn_set = 0;
        return (mbi);
    }

out:
    /* Error Out: report the error bits, release the buffer and device */
    if (flag)
        *flag |= error;

    kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
    mddb_devclose(dev);
    return ((mddb_mb_ic_t *)NULL);
}

/*
 * getrecord - allocate the record buffer for a directory entry and read
 * the record in from replica `li'.
 *
 * The record must carry the record-block magic, one of the accepted
 * record revisions, and a valid crc.  Returns 0 on success or a
 * combination of MDDB_F_* error bits.  dep->de_rb is left allocated on
 * failure; readcopy frees it during its own error cleanup.
 */
static int
getrecord(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep,
    int     li
)
{
    mddb_rb32_t *rbp;
    int     rderr;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    rbp = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
    dep->de_rb = rbp;

    rderr = readblklst(s, (caddr_t)rbp, dep->de_blks,
        dep->de_blkcount, li, 0);
    if (rderr != 0)
        return (MDDB_F_EDATA | rderr);

    if (rbp->rb_magic != MDDB_MAGIC_RB)
        return (MDDB_F_EFMT | MDDB_F_EDATA);

    /* The revision has to pass at least one of the known formats */
    if (!((revchk(MDDB_REV_RB, rbp->rb_revision) == 0) ||
        (revchk(MDDB_REV_RB64, rbp->rb_revision) == 0) ||
        (revchk(MDDB_REV_RBFN, rbp->rb_revision) == 0) ||
        (revchk(MDDB_REV_RB64FN, rbp->rb_revision) == 0)))
        return (MDDB_F_EFMT | MDDB_F_EDATA);

    /* Verify the crc of the record itself */
    if (rec_crcchk(s, dep, rbp))
        return (MDDB_F_EFMT | MDDB_F_EDATA);

    return (0);
}

/*
 * readlocnames - read the locator name blocks from replica `li' and
 * validate them (magic, revision appropriate to MN/non-MN sets, crc).
 *
 * On success s->s_lnp points at the newly read blocks and 0 is
 * returned.  On failure the buffer is freed, s->s_lnp is left NULL and
 * MDDB_F_* error bits are returned.
 */
static int
readlocnames(
    mddb_set_t  *s,
    int     li
)
{
    mddb_block_t    firstblk = s->s_lbp->lb_lnfirstblk;
    mddb_block_t    nblks = s->s_lbp->lb_lnblkcnt;
    mddb_ln_t   *lnp;
    int     err;

    s->s_lnp = NULL;

    lnp = (mddb_ln_t *)kmem_zalloc(dbtob(nblks), KM_SLEEP);

    err = readblks(s, (caddr_t)lnp, firstblk, nblks, li);
    if (err) {
        err |= MDDB_F_EDATA;
    } else if (lnp->ln_magic != MDDB_MAGIC_LN) {
        err = MDDB_F_EDATA | MDDB_F_EFMT;
    } else {
        /* MN disksets use their own locator name revision */
        if (s->s_lbp->lb_flags & MDDB_MNSET) {
            if (revchk(MDDB_REV_MNLN, lnp->ln_revision))
                err = MDDB_F_EDATA | MDDB_F_EFMT;
        } else {
            if (revchk(MDDB_REV_LN, lnp->ln_revision))
                err = MDDB_F_EDATA | MDDB_F_EFMT;
        }
        if (!err &&
            crcchk(lnp, &lnp->ln_checksum, dbtob(nblks), NULL))
            err = MDDB_F_EDATA | MDDB_F_EFMT;
    }

    /* Any problem with the locator name blocks: discard them */
    if (err) {
        kmem_free((caddr_t)lnp, dbtob(nblks));
        return (err);
    }

    s->s_lnp = lnp;
    return (0);
}

/*
 * readcopy - read in a copy of the database from replica `li'.
 *
 * Phase 1 follows the chain of directory blocks starting at
 * lb_dbfirstblk, validating each block (magic, revision, crc) and
 * building the incore directory block / directory entry lists.
 * Phase 2 reads every non-optimized record and folds the record
 * checksums into a global XOR that must come out as zero.
 *
 * On success s->s_dbp points at the incore directory list and 0 is
 * returned.  On any failure everything allocated here is freed,
 * s->s_dbp is set to NULL and MDDB_F_* error bits are returned.
 */

static int
readcopy(
    mddb_set_t  *s,
    int     li
)
{
    uint_t      blk;
    mddb_db_t   *dbp, *dbp1, *dbhp;
    mddb_db32_t *db32p;
    mddb_de_ic_t    *dep, *dep2;
    mddb_de32_t *de32p, *de32p2;
    int     err = 0;
    uint_t      checksum;


#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

    dbp = NULL;
    dbhp = NULL;
    /*
     *  read in all the directory blocks
     */
    blk = s->s_lbp->lb_dbfirstblk;
    db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);

    for (; blk != 0; blk = dbp->db_nextblk) {
        dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
        if (! dbhp) {
            dbhp = dbp1;
        } else {
            dbp->db_next = dbp1;
        }
        dbp = dbp1;

        err = readblks(s, (caddr_t)db32p, blk, 1, li);
        if (err) {
            err |= MDDB_F_EDATA;
            break;
        }
        if (db32p->db32_magic != MDDB_MAGIC_DB) {
            err = MDDB_F_EDATA | MDDB_F_EFMT;
            break;
        }
        if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
            err = MDDB_F_EDATA | MDDB_F_EFMT;
            break;
        }
        if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
            err = MDDB_F_EDATA | MDDB_F_EFMT;
            break;
        }
        /*
         * Convert to the incore form only once the block has been
         * validated.  Until the fix-up below runs, the converted
         * db_firstentry is just the raw on-disk value; bailing out
         * after converting would let the error cleanup treat that
         * value as a pointer and free it.  A block that fails one
         * of the checks above is left zero-filled instead.
         */
        db32todb(db32p, dbp);
        /*
         * first go through and fix up all de_next pointers
         */
        if (dbp->db_firstentry) {

            de32p = (mddb_de32_t *)
                ((void *) ((caddr_t)(&db32p->db32_firstentry)
                + sizeof (db32p->db32_firstentry)));

            dep = (mddb_de_ic_t *)
                kmem_zalloc(sizeof (mddb_de_ic_t) -
                sizeof (mddb_block_t) +
                sizeof (mddb_block_t) * de32p->de32_blkcount,
                KM_SLEEP);
            de32tode(de32p, dep);

            dbp->db_firstentry = dep;
            while (de32p && de32p->de32_next) {

                de32p2 = nextentry(de32p);

                dep2 = (mddb_de_ic_t *)kmem_zalloc(
                    sizeof (mddb_de_ic_t) -
                    sizeof (mddb_block_t) +
                    sizeof (mddb_block_t) *
                    de32p2->de32_blkcount, KM_SLEEP);

                de32tode(de32p2, dep2);

                dep->de_next = dep2;
                dep = dep2;
                de32p = de32p2;
            }
        }
        /*
         * go through and make sure all of the pointers to record
         * blocks are null
         */
        for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
            dep->de_rb = NULL;
    }
    kmem_free((caddr_t)db32p, MDDB_BSIZE);
    /*
     * Terminate the incore list.  dbp is still NULL if the locator
     * block recorded no directory blocks at all (lb_dbfirstblk == 0),
     * in which case there is nothing to terminate.
     */
    if (dbp != NULL)
        dbp->db_next = NULL;
    /*
     *  if error occurred in directory blocks free them
     *  and return
     */
    if (err) {
        dbp = dbhp;
        while (dbp) {
            dep = dbp->db_firstentry;
            while (dep) {
                /* No mddb_rb32_t structures yet */
                dep2 = dep->de_next;
                kmem_free((caddr_t)dep, sizeofde(dep));
                dep = dep2;
            }
            dbp1 = dbp->db_next;
            kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
            dbp = dbp1;
        }
        s->s_dbp = NULL;
        return (err);

    }
    /*
     * Read in the records and accumulate the global checksum: the
     * XOR of MDDB_GLOBAL_XOR, every directory block's db_recsum and
     * every record's checksum/fiddle pair must be zero.  Optimized
     * records are not read here.
     */
    err = 0;
    checksum = MDDB_GLOBAL_XOR;
    for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
        checksum ^= dbp->db_recsum;
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            if (dep->de_flags & MDDB_F_OPT)
                continue;
            err = getrecord(s, dep, li);
            if (err)
                break;
            /* Don't include CHANGELOG in big XOR */
            if (dep->de_flags & MDDB_F_CHANGELOG)
                continue;
            checksum ^= dep->de_rb->rb_checksum;
            checksum ^= dep->de_rb->rb_checksum_fiddle;
        }
        if (err)
            break;
    }
    if (checksum) {
        if (! err)
            err = MDDB_F_EDATA | MDDB_F_EFMT;
    }
    if (err) {
        /* Tear down everything, including any records already read */
        dbp = dbhp;
        dbhp = NULL;
        while (dbp) {
            dep = dbp->db_firstentry;
            while (dep) {
                if (dep->de_rb)
                    kmem_free((caddr_t)dep->de_rb,
                        dep->de_recsize);
                dep2 = dep->de_next;
                kmem_free((caddr_t)dep, sizeofde(dep));
                dep = dep2;
            }
            dbp1 = dbp->db_next;
            kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
            dbp = dbp1;
        }
    }
    s->s_dbp = dbhp;
    return (err);
}

/*
 * getoptcnt - count the optimized records (MDDB_F_OPT directory
 * entries) that have an active copy on locator index `li'.  An entry is
 * counted once even if both of its optinfo slots refer to `li'.
 */
static int
getoptcnt(
    mddb_set_t  *s,
    int     li)
{
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    int     count = 0;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep != NULL;
            dep = dep->de_next) {
            mddb_optinfo_t  *op0 = &dep->de_optinfo[0];
            mddb_optinfo_t  *op1 = &dep->de_optinfo[1];

            if ((dep->de_flags & MDDB_F_OPT) == 0)
                continue;

            if ((op0->o_flags & MDDB_F_ACTIVE) &&
                (li == op0->o_li)) {
                count++;
            } else if ((op1->o_flags & MDDB_F_ACTIVE) &&
                (li == op1->o_li)) {
                count++;
            }
        }
    }
    return (count);
}

/*
 * getoptdev - choose the replica that will hold copy `opti' of an
 * optimized record and store the choice in rdep->de_optinfo[opti].
 *
 * The chosen replica is the active, non-removable one carrying the
 * fewest optimized records (see getoptcnt).  If the other copy of the
 * record is active, its replica is avoided: normally every replica on
 * the same device as the other copy is skipped, but when all active
 * replicas share a single device only the other copy's own locator
 * index is skipped.
 *
 * If no suitable replica exists the result slot is left with
 * o_flags == 0.
 */
static void
getoptdev(
    mddb_set_t  *s,
    mddb_de_ic_t    *rdep,
    int     opti
)
{
    mddb_lb_t   *lbp;
    mddb_locator_t  *lp;
    mddb_optinfo_t  *otherop;
    mddb_optinfo_t  *resultop;
    int     li;
    dev_t       otherdev;
    int     blkonly = 0;
    int     mincnt;
    int     thiscnt;

    lbp = s->s_lbp;

    resultop = &rdep->de_optinfo[opti];
    otherop = &rdep->de_optinfo[1-opti];

    resultop->o_flags = 0;

    /*
     * scan through and see if data bases have to vary by only device
     */

    if (otherop->o_flags & MDDB_F_ACTIVE) {
        blkonly = 1;
        otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if (! (lp->l_flags & MDDB_F_ACTIVE))
                continue;
            if (expldev(lp->l_dev) != otherdev) {
                blkonly = 0;
                break;
            }
        }
    }

    mincnt = 999999;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        dev_info_t  *devi;
        dev_t       ldev;
        int     removable = 0;

        lp = &lbp->lb_locators[li];
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;

        /*
         * The locator's l_dev is expanded with expldev() everywhere
         * else in this routine before being used as a dev_t; do it
         * once here so the device lookups below use the same value
         * as the comparisons.
         */
        ldev = expldev(lp->l_dev);

        if (otherop->o_flags & MDDB_F_ACTIVE) {
            if (blkonly) {
                if (otherop->o_li == li)
                    continue;
            } else {
                if (otherdev == ldev)
                    continue;
            }
        }

        /*
         * Check if this is a removable device.  If it is we
         * assume it is something like a USB flash disk, a zip disk
         * or even a floppy that is being used to help maintain
         * mddb quorum.  We don't want to put any optimized resync
         * records on these kinds of disks since they are usually
         * slower or don't have the same read/write lifetimes as
         * a regular fixed disk.
         */
        if ((devi = e_ddi_hold_devi_by_dev(ldev, 0)) != NULL) {
            int     error;
            struct cb_ops   *cb;
            ddi_prop_op_t   prop_op = PROP_LEN_AND_VAL_BUF;
            int     propvalue = 0;
            int     proplength = sizeof (int);

            if ((cb = devopsp[getmajor(ldev)]->devo_cb_ops)
                != NULL) {
                error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
                    prop_op, DDI_PROP_NOTPROM |
                    DDI_PROP_DONTPASS, "removable-media",
                    (caddr_t)&propvalue, &proplength);

                /* The property merely has to exist */
                if (error == DDI_PROP_SUCCESS)
                    removable = 1;
            }

            ddi_release_devi(devi);
        }

        if (removable)
            continue;

        /* Prefer the replica currently holding the fewest records */
        thiscnt = getoptcnt(s, li);
        if (thiscnt < mincnt) {
            resultop->o_li  = li;
            mincnt = thiscnt;
            resultop->o_flags = MDDB_F_ACTIVE;
        }
    }
}

/*
 * Allocate the in-core user data area of a directory entry and fill it
 * with a copy of the data portion of the entry's record block.
 *
 * dep - directory entry.  dep->de_rb must already point at the record
 *       block; dep->de_reqsize is the number of bytes allocated and copied.
 *
 * The record block's rb_private field is cleared and rb_userdata is set
 * to a non-zero marker value.
 */
static void
allocuserdata(
    mddb_de_ic_t    *dep
)
{
    mddb_rb32_t *rec;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    rec = dep->de_rb;

    /* Header fields of the record block */
    rec->rb_private = 0;

    /* Zero-filled buffer that receives the copy of the record data */
    dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);

    /* Any non-zero value will do here */
    rec->rb_userdata = 0x4;

    bcopy((caddr_t)rec->rb_data, dep->de_rb_userdata, dep->de_reqsize);
}


/*
 * Copy a record's in-core user data (dep->de_rb_userdata) into the data
 * area of its record block (dep->de_rb->rb_data).
 *
 * A driver record (type >= MDDB_FIRST_MODID) with an old style revision
 * (MDDB_REV_RB or MDDB_REV_RBFN) was incore as a 64 bit structure but its
 * on disk layout has only 32 bit for block sizes, so it is run through
 * the matching *_convert routine in the BIG_2_SMALL direction.  No
 * conversion is done when the set status has MD_SET_IMPORT or
 * MD_SET_REPLICATED_IMPORT set; in that case, and for every other kind of
 * record, the data is copied unchanged.
 *
 * setno - set the record belongs to
 * dep   - directory entry of the record
 */
static void
getuserdata(
    set_t       setno,
    mddb_de_ic_t    *dep
)
{
    mddb_rb32_t *rec;
    mddb_type_t rectype = dep->de_type1;
    caddr_t     disk_data;
    caddr_t     core_data;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif
    rec = dep->de_rb;
    disk_data = (caddr_t)rec->rb_data;
    core_data = (caddr_t)dep->de_rb_userdata;

    /*
     * Anything that is not an old style driver record of a set that is
     * not being imported needs no conversion: plain copy and done.
     */
    if ((md_get_setstatus(setno) &
        (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) ||
        (rectype < MDDB_FIRST_MODID) ||
        ((rec->rb_revision != MDDB_REV_RB) &&
        (rec->rb_revision != MDDB_REV_RBFN))) {
        bcopy(core_data, disk_data, dep->de_reqsize);
        return;
    }

    /* Pick the converter from the record's flags */
    switch (dep->de_flags) {
    case MDDB_F_STRIPE:
        stripe_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_MIRROR:
        mirror_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_RAID:
        raid_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_SOFTPART:
        softpart_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_TRANS_MASTER:
        trans_master_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_TRANS_LOG:
        trans_log_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_HOTSPARE:
        hs_convert(disk_data, core_data, BIG_2_SMALL);
        break;

    case MDDB_F_OPT:
    default:
        /* No converter for this kind of record */
        bcopy(core_data, disk_data, dep->de_reqsize);
        break;
    }
}

/*
 * Read an optimized record from disk into a newly allocated in-core
 * record block, dep->de_rb.
 *
 * dep->de_optinfo[] describes up to two on-disk copies of the record.
 * Both slots are first tagged with MDDB_F_EDATA.  Each slot that is
 * MDDB_F_ACTIVE and lives on a locator that is active and not in
 * MDDB_F_EMASTER error is read; if the copy has the right magic number,
 * an acceptable revision and a good crc, the slot's flags are reset to
 * plain MDDB_F_ACTIVE.
 *
 * The first good copy stays in dep->de_rb.  A second good copy is read
 * into a scratch buffer and is only used to compare its checksum with the
 * first copy; if they differ, slot 1 is tagged MDDB_F_EDATA again.
 *
 * If no good copy is found, dep->de_rb is set up as an empty record with
 * a new timestamp and crc.
 *
 * s   - set the record belongs to
 * dep - directory entry of the optimized record; de_recsize, de_blks,
 *       de_blkcount and de_optinfo[] are read, de_rb and
 *       de_optinfo[].o_flags are written
 */
static void
getoptrecord(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep
)
{
    mddb_lb_t   *lbp;
    mddb_locator_t  *lp;
    mddb_rb32_t *rbp, *crbp;
    int     li;
    int     i;
    int     err = 0;
    size_t      recsize;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    lbp = s->s_lbp;

    /*
     * rbp is the buffer the next read goes into.  It starts out as the
     * record block itself and is switched to the scratch buffer (crbp)
     * once one good copy has been read.
     */
    recsize = dep->de_recsize;
    dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
    rbp = dep->de_rb;
    crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);

    /* Assume bad data until a copy proves otherwise */
    dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
    dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;

    for (i = 0; i < 2; i++) {
        if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
            continue;
        li = dep->de_optinfo[i].o_li;
        lp = &lbp->lb_locators[li];

        if (! (lp->l_flags & MDDB_F_ACTIVE) ||
            (lp->l_flags & MDDB_F_EMASTER))
            continue;

        err = readblklst(s, (caddr_t)rbp, dep->de_blks,
            dep->de_blkcount, li, 0);

        if (err)
            continue;

        if (rbp->rb_magic != MDDB_MAGIC_RB)
            continue;

        if (revchk(MDDB_REV_RB, rbp->rb_revision))
            continue;

        /* Check the crc for this record */
        if (rec_crcchk(s, dep, rbp)) {
            continue;
        }

        dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;

        if (rbp == crbp) {
            /*
             * This is the second good copy (it was read into the
             * scratch buffer).  Compare it with the first good
             * copy, which is in dep->de_rb.  Comparing rbp with
             * crbp here would compare the scratch buffer with
             * itself and could never detect a difference.
             */
            if (dep->de_rb->rb_checksum != crbp->rb_checksum)
                dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
            break;
        }
        rbp = crbp;
    }

    if (rbp == crbp) {
        /*
         * At least one good copy was read into dep->de_rb.
         *
         * NOTE(review): rbp is the scratch buffer here, so this
         * store lands in memory that is freed on the next line.
         * Presumably dep->de_rb->rb_private was meant - confirm
         * before changing, since this routine does not show how
         * rb_private is used afterwards.
         */
        rbp->rb_private = 0;
        kmem_free((caddr_t)crbp, recsize);
        return;
    }

    /* No usable copy on disk: start from an empty record */
    bzero((caddr_t)rbp, recsize);
    rbp->rb_magic = MDDB_MAGIC_RB;
    rbp->rb_revision = MDDB_REV_RB;
    uniqtime32(&rbp->rb_timestamp);
    /* Generate the crc for this record */
    rec_crcgen(s, dep, rbp);
    kmem_free((caddr_t)crbp, recsize);
}

/*
 * writeoptrecord writes out an optimized record.
 *
 * The record block dep->de_rb is queued for writing to each of the (up to
 * two) replicas listed in dep->de_optinfo[] that are MDDB_F_ACTIVE, and
 * the routine then waits for every queued buffer to complete.
 *
 * Queuing is serialized per set through s_opthavequeuinglck /
 * s_optwantqueuinglck and the s_optqueuing_cv condition variable.  The
 * cv_wait() and mutex_exit() calls below operate on SETMUTEX(s->s_setno),
 * so the caller is expected to hold that mutex on entry; it is dropped
 * while waiting for each buffer and held again on return.
 *
 * s   - set the record belongs to
 * dep - directory entry of the optimized record
 *
 * Returns the error bits from wrtblklst() or'ed together, with
 * MDDB_F_EWRITE added if any buffer completed with B_ERROR; 0 if
 * everything succeeded.
 */
static int
writeoptrecord(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep
)
{
    mddb_rb32_t *rbp;
    int     li;
    int     err = 0, wrt_err = 0;
    mddb_bf_t   *bufhead, *bfp;
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_locator_t  *lp;
    int     i;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    bufhead = NULL;
    err = 0;

    /* Wait until no other thread is queuing optimized record writes */
    while (s->s_opthavequeuinglck) {
        s->s_optwantqueuinglck++;
        cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
    }
    s->s_opthavequeuinglck++;
    rbp = dep->de_rb;
    for (i = 0; i < 2; i++) {
        /*
         * only possible error is xlate. This can
         * occur if a replica was off line and came
         * back. During the mean time the database grew
         * larger than the now on line replica can store
         */
        if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
            continue;
        li = dep->de_optinfo[i].o_li;
        /*
         * In a MN diskset, any node can write optimized record(s).
         * The buffers that get queued are chained onto bufhead and
         * waited for below.
         */
        wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
            dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
        /*
         * For MN diskset, set error in optinfo structure so
         * that mddb_commitrec knows which replica failed.
         */
        if ((MD_MNSET_SETNO(s->s_setno)) &&
            (wrt_err & MDDB_F_EWRITE)) {
            dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
        }
        err |= wrt_err;
    }
    /* Queuing is done: release the queuing lock and wake any waiters */
    s->s_opthavequeuinglck = 0;
    if (s->s_optwantqueuinglck) {
        s->s_optwantqueuinglck = 0;
        cv_broadcast(&s->s_optqueuing_cv);
    }
    /*
     * Wait for each queued buffer in turn.  bufhead is advanced before
     * the buffer is freed, so the loop always restarts from the new
     * head of the chain.
     */
    for (bfp = bufhead; bfp; bfp = bufhead) {
        /* The set mutex is not held while waiting for the I/O */
        mutex_exit(SETMUTEX(s->s_setno));
        (void) biowait(&bfp->bf_buf);
        mutex_enter(SETMUTEX(s->s_setno));
        if (bfp->bf_buf.b_flags & B_ERROR) {
            /*
             * If an MN diskset, don't set replica
             * in error since this hasn't been set in master.
             * Setting replica in error before master could
             * leave the nodes with different views of the
             * world since a class 1 configuration change
             * could occur in mddb_commitrec as soon as
             * all locks are dropped.  Must keep this
             * node the same as master and can't afford a
             * failure from the class 1 config change
             * if master succeeded.
             */
            if (!(MD_MNSET_SETNO(s->s_setno))) {
                bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
            } else {
                /*
                 * Find which de_optinfo (which replica)
                 * had a failure and set the failure in
                 * the o_flags field.  Anything that is not
                 * the locator of slot 0 is attributed to
                 * slot 1.
                 */
                lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
                if (lp == bfp->bf_locator) {
                    dep->de_optinfo[0].o_flags |=
                        MDDB_F_EWRITE;
                } else {
                    dep->de_optinfo[1].o_flags |=
                        MDDB_F_EWRITE;
                }
            }
            err |= MDDB_F_EWRITE;
        }
        bufhead = bfp->bf_next;
        freebuffer(s, bfp);
    }
    return (err);
}

/*
 * Fix up the optimized resync record.  Used in the traditional and local
 * disksets to move an optimized record from a failed or deleted mddb
 * to an active one.
 *
 * In a MN diskset, the fixing of the optimized record is split between
 * the master and slave nodes.  If the master node moves the optimized
 * resync record, then the master node will send a MDDB_PARSE_OPTRECS
 * message to the slave nodes causing the slave nodes to reget the
 * directory entry containing the location of the optimized resync record.
 * After the record is reread from disk, then writeoptrecord is called
 * if the location of the optimized resync record or flags have changed.
 * When writeoptrecord is called, the node that is the owner of this record
 * will write the optimized record to the location specified in the directory
 * entry.  Since the master node uses the highest class message (PARSE)
 * the record owner node is guaranteed to already have an updated
 * directory entry incore.
 *
 * The other difference between the traditional/local set and MN diskset
 * is that the directory entry can be written to disk before the optimized
 * record in a MN diskset if the record is owned by a slave node.  So,
 * the users of an optimized record must handle the failure case when no
 * data is available from an optimized record since the master node could
 * have failed during the relocation of the optimized record to another mddb.
 *
 * s   - set the record belongs to
 * dep - directory entry of the optimized record
 * dbp - directory block that contains dep; it is rewritten to disk when
 *       a slot of the record was assigned a new replica
 *
 * Returns 0 on success or when nothing had to be changed, otherwise the
 * error returned by writeoptrecord() or writeall().
 */
static int
fixoptrecord(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep,
    mddb_db_t   *dbp
)
{
    int     changed;
    int     writedata;
    int     err = 0;
    int     i;
    mddb_lb_t   *lbp;
    mddb_optinfo_t  *op;
    mddb_db32_t *db32p;
    int     rec_owner;  /* Is node owner of record? */

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

    lbp = s->s_lbp;
    changed = 0;    /* a slot was handed to getoptdev: rewrite directory */
    writedata = 0;  /* the record data itself must be rewritten */
    for (i = 0; i < 2; i++) {
        op = &dep->de_optinfo[i];

        /* A slot on a locator that is no longer active is unusable */
        if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
            op->o_flags = 0;

        /*
         * If optimized record has seen a replica failure,
         * assign new replica to record and re-write data
         * to new record.
         */
        if (! (op->o_flags & MDDB_F_ACTIVE)) {
            getoptdev(s, dep, i);
            writedata++;
            changed++;
            /* Set flag for slaves to reread dep and write rec */
            if (lbp->lb_flags & MDDB_MNSET) {
                s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
            }
        }

        /*
         * If just an error in the data was seen, set
         * the optimized record's replica flag to active (ok)
         * and try again.
         *
         * NOTE(review): the flags of slot 0 are reset here even
         * when the MDDB_F_EDATA bit was found on slot 1, and slot
         * 1 keeps its MDDB_F_EDATA bit.  Confirm whether that is
         * intended before changing it: resetting op->o_flags
         * instead would also mark a slot active that getoptdev
         * may not have been able to assign a replica to.
         */
        if (op->o_flags & MDDB_F_EDATA) {
            dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
            writedata++;
        }
    }

    rec_owner = 0;
    if (lbp->lb_flags & MDDB_MNSET) {
        /*
         * If a MN diskset then check the owner of optimized record.
         * If the master node owns the record or if there is
         * no owner of the record, then the master can write the
         * optimized record to disk.
         * Master node can write the optimized record now, but
         * slave nodes write their records during handling of
         * the MDDB_PARSE_OPTRECS message.
         */
        if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
            (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
            rec_owner = 1;
        }
    } else {
        /*
         * In traditional diskset and local set, this node
         * is always the record owner and always the master.
         */
        rec_owner = 1;
    }

    /*
     * If this node is the record owner, write out record.
     */
    if ((writedata) && (rec_owner)) {
        if (err = writeoptrecord(s, dep)) {
            return (err);
        }
    }
    /* The directory block only changes when a slot was reassigned */
    if (! changed)
        return (0);
    /*
     * Rewrite the directory block: new timestamp and revision, then
     * build the on-disk (32 bit) image, checksum it and write it out.
     */
    uniqtime32(&dbp->db_timestamp);
    dbp->db_revision = MDDB_REV_DB;
    db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
    create_db32rec(db32p, dbp);
    crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
    err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
        1, MDDB_WR_ONLY_MASTER);
    kmem_free((caddr_t)db32p, MDDB_BSIZE);
    return (err);
}

/*
 * Run fixoptrecord() on every optimized record (MDDB_F_OPT) found in the
 * directory blocks of the set.
 *
 * s - set whose optimized records are to be fixed
 *
 * Returns 0 on success, or the first non-zero value returned by
 * fixoptrecord(); the remaining records are not visited in that case.
 */
static int
fixoptrecords(
    mddb_set_t      *s
)
{
    set_t       setno = s->s_setno;
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    int     err;

    /*
     * In a MN diskset, the master node is the only node that runs
     * fixoptrecords.  If the master node changes anything, then the
     * master node sends PARSE message to the slave nodes.  The slave
     * nodes will then re-read in the locator block or re-read in the
     * directory blocks and re-write the optimized resync records.
     */
    if (setno != MD_LOCAL_SET) {
        if ((s->s_lbp->lb_flags & MDDB_MNSET) &&
            (md_set[setno].s_am_i_master == 0)) {
            return (0);
        }
    }

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        dep = dbp->db_firstentry;
        while (dep) {
            if (dep->de_flags & MDDB_F_OPT) {
                err = fixoptrecord(s, dep, dbp);
                if (err != 0)
                    return (err);
            }
            dep = dep->de_next;
        }
    }
    return (0);
}

/*
 * Checks incore version of mddb data to mddb data ondisk.
 *
 * s  - set whose incore data is checked
 * li - index of the locator (replica) whose ondisk data is read
 *
 * The check is done in two passes: first every directory block and its
 * directory entries are compared, then the checksums of every record
 * that is neither an optimized record nor a change log record.
 *
 * All reads go through s->s_databuffer, which is allocated here the first
 * time the routine finds s->s_databuffer_size to be 0.
 *
 * Returns:
 *  - 0 if the data was successfully read and is good.
 *  - MDDB_F_EREAD if a read error occurred.
 *  - 1 if the data read is bad (checksum failed, etc)
 */
static int
checkcopy
(
    mddb_set_t  *s,
    int     li
)
{
    mddb_db_t   *dbp;
    mddb_db32_t *cdb32p;
    mddb_de_ic_t    *dep;
    mddb_de32_t *cde32p;
    mddb_rb32_t *rbp, *crbp;
    size_t      size;
    int     i;
    int     retval = 1; /* "bad data" unless a read fails */

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    /*
     * Size the shared buffer so it can hold one directory block
     * (MDDB_BSIZE) or the largest record that is not an optimized
     * record, whichever is bigger.
     */
    if (s->s_databuffer_size == 0) {
        size_t maxrecsize = MDDB_BSIZE;

        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
            for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
                if (! (dep->de_flags & MDDB_F_OPT) &&
                    dep->de_recsize > maxrecsize)
                    maxrecsize = dep->de_recsize;

        s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
        s->s_databuffer_size = maxrecsize;
    }

    cdb32p = (mddb_db32_t *)s->s_databuffer;

    /*
     * first go through and make sure all directory stuff
     * is the same
     */
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
            retval = MDDB_F_EREAD;
            goto err;
        }
        /* The ondisk block must be a valid directory block ... */
        if (cdb32p->db32_magic != MDDB_MAGIC_DB)
            goto err;
        if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
            goto err;
        if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
            goto err;
        /* ... and agree with the incore one */
        if (cdb32p->db32_nextblk != dbp->db_nextblk)
            goto err;
        if (cdb32p->db32_recsum != dbp->db_recsum)
            goto err;
        /*
         * When the block has entries, the first ondisk entry starts
         * right behind the db32_firstentry field.
         */
        if (cdb32p->db32_firstentry) {
            cde32p = (mddb_de32_t *)
                ((void *)((caddr_t)(&cdb32p->db32_firstentry)
                + sizeof (cdb32p->db32_firstentry)));
        } else
            cde32p = NULL;

        dep = dbp->db_firstentry;
        /*
         * check if all directory entries are identical
         */
        while (dep && cde32p) {
            if (dep->de_recid != cde32p->de32_recid)
                goto err;
            if (dep->de_type1 != cde32p->de32_type1)
                goto err;
            if (dep->de_type2 != cde32p->de32_type2)
                goto err;
            if (dep->de_reqsize != cde32p->de32_reqsize)
                goto err;
            if (dep->de_flags != cde32p->de32_flags)
                goto err;

            /*
             * Only the locator index of each optimized record
             * slot is compared, not the slot's flags.
             */
            for (i = 0; i < 2; i++) {
                if (dep->de_optinfo[i].o_li !=
                    cde32p->de32_optinfo[i].o_li)
                    break;
            }
            if (i != 2)
                goto err;
            /*
             * The block list is compared over the incore
             * entry's block count.
             */
            size = sizeof (mddb_block_t) * dep->de_blkcount;
            if (bcmp((caddr_t)dep->de_blks,
                (caddr_t)cde32p->de32_blks, size))
                goto err;
            dep = dep->de_next;
            if (cde32p->de32_next)
                cde32p = nextentry(cde32p);
            else
                cde32p = NULL;
        }
        /* One side has more entries than the other */
        if (dep || cde32p)
            goto err;
    }
    /*
     * If here, all directories are functionally identical
     * check to make sure all records are identical
     * the reason the records are not just bcmped is that the
     * lock flag does not want to be compared.
     *
     * The shared buffer is reused to hold each record in turn.
     */
    crbp = (mddb_rb32_t *)cdb32p;
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            /* Optimized and change log records are not compared */
            if ((dep->de_flags & MDDB_F_OPT) ||
                (dep->de_flags & MDDB_F_CHANGELOG))
                continue;
            rbp = (mddb_rb32_t *)dep->de_rb;
            if (readblklst(s, (caddr_t)crbp, dep->de_blks,
                dep->de_blkcount, li, 0)) {
                retval = MDDB_F_EREAD;
                goto err;
            }
            /* Check the crc for this record */
            if (rec_crcchk(s, dep, crbp))
                goto err;

            /* Incore and ondisk record must carry the same sums */
            if (rbp->rb_checksum != crbp->rb_checksum ||
                rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
                goto err;
        }
    }
    return (0);
err:
    return (retval);
}

/*
 * Determine if the location information for two mddbs is the same.
 * The device slice and block offset should match.  If both have devids then
 * use that for the comparison, otherwise we compare the dev_ts.
 * Comparing with the devid allows us to handle the case where a mddb was
 * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
 * the dead mddb but the devid comparison will catch this and not match.
 *
 * rip   - replica list element to compare against
 * devid - devid of the other mddb, or NULL if it has none
 * minor - minor name that goes with devid, or NULL
 * dev   - dev_t of the other mddb, used when devids cannot be compared
 * blkno - block offset of the other mddb
 *
 * Return 1 if the location of the two mddbs match, 0 if not.
 */
static int
match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
    daddr32_t blkno)
{
    int     devid_differs;

    /*
     * An errored element never matches.  Matching on it could
     * erroneously match on the dev_t of a relocated disk.
     */
    if (rip->ri_flags & MDDB_F_EMASTER)
        return (0);

    if (rip->ri_devid && devid && minor) {
        devid_differs = (ddi_devid_compare(rip->ri_devid, devid) != 0);

        /*
         * If old devid exists, then this is a replicated diskset
         * and the old devid is accepted as well as the new one.
         */
        if (devid_differs && rip->ri_old_devid) {
            devid_differs =
                (ddi_devid_compare(rip->ri_old_devid, devid) != 0);
        }

        if (devid_differs ||
            (strcmp(rip->ri_minor_name, minor) != 0))
            return (0);
    } else if (rip->ri_dev != dev) {
        /* No devids to go by: fall back to the dev_t */
        return (0);
    }

    /* Same device; the block offset decides */
    return ((rip->ri_blkno == blkno) ? 1 : 0);
}

/*
 * Enter the replica described by clp into the replica list *rip.
 *
 * The device of the replica is built from the driver name and minor
 * number in clp.  If clp carries a devid, the devid is validated; when the
 * devid now resolves to a different device (and this is not an upgrade)
 * the driver name and minor number in clp are updated to that device.  A
 * devid that does not validate gets MDDB_DEVID_VALID cleared in
 * clp->l_devid_flags.
 *
 * If the list already has a matching element (see match_mddb), only its
 * flags are updated.  Otherwise a new element is allocated and appended
 * to the end of the list.  An element whose devid is present but not valid
 * is marked MDDB_F_EMASTER.
 *
 * rip          - address of the head of the replica list
 * clp          - location of the replica; l_driver, l_mnum, l_dev and
 *                l_devid_flags may be modified
 * dev_2b_fixed - if not NULL, receives the resulting clp->l_dev
 * flag         - flag bits to set in the list element
 *
 * Returns 0 on success, EINVAL if the driver name has no major number.
 */
static int
ridev(
    mddb_ri_t   **rip,
    mddb_cfg_loc_t  *clp,
    dev32_t     *dev_2b_fixed,
    int     flag)
{
    mddb_ri_t   *ent;
    mddb_ri_t   **linkp;
    md_dev64_t  curdev;
    md_dev64_t  devid_dev;
    major_t     drvmajor;
    int     idsz;

    /* Build the device from driver name and minor number */
    if (MD_UPGRADE) {
        curdev = md_makedevice(md_targ_name_to_major(clp->l_driver),
            clp->l_mnum);
    } else {
        drvmajor = ddi_name_to_major(clp->l_driver);
        if (drvmajor == (major_t)-1)
            return (EINVAL);

        curdev = md_makedevice(drvmajor, clp->l_mnum);
    }

    if (clp->l_devid != 0) {
        /*
         * Get dev associated with device id and minor name.
         * Setup correct driver name if dev is now different.
         * Don't change driver name if during upgrade.
         */
        devid_dev = curdev;
        if (mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
            &devid_dev, clp->l_minor_name)) {
            /* Mark as invalid */
            clp->l_devid_flags &= ~MDDB_DEVID_VALID;
        } else if ((devid_dev != curdev) && (!(MD_UPGRADE))) {
            (void) strcpy(clp->l_driver,
                ddi_major_to_name(md_getmajor(devid_dev)));
            clp->l_mnum = md_getminor(devid_dev);
            clp->l_devid_flags |= MDDB_DEVID_VALID;
            curdev = devid_dev;
        }
    }

    clp->l_dev = md_cmpldev(curdev);
    if (dev_2b_fixed)
        *dev_2b_fixed = clp->l_dev;

    /* Is this replica already on the list? */
    for (ent = *rip; ent != NULL; ent = ent->ri_next) {
        if (!match_mddb(ent, (ddi_devid_t)(uintptr_t)clp->l_devid,
            clp->l_minor_name, curdev, clp->l_blkno))
            continue;

        if ((clp->l_devid != 0) &&
            !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
            ent->ri_flags |= MDDB_F_EMASTER;
        } else {
            ent->ri_flags |= flag;
        }
        return (0); /* already entered return success */
    }

    /*
     * This replica not represented in the current rip list,
     * so add it to the list.
     */
    ent = (mddb_ri_t *)kmem_zalloc(sizeof (*ent), KM_SLEEP);
    ent->ri_dev = curdev;
    ent->ri_blkno = clp->l_blkno;

    /*
     * strncpy zero fills the remainder of the field, so the last byte
     * only needs attention when the name was cut off; storing the
     * terminator unconditionally gives the same result.
     */
    (void) strncpy(ent->ri_driver, clp->l_driver, MD_MAXDRVNM);
    ent->ri_driver[(MD_MAXDRVNM - 1)] = '\0';

    if (clp->l_devname != NULL) {
        (void) strcpy(ent->ri_devname, clp->l_devname);
    }
    ent->ri_flags |= flag;

    if (clp->l_devid == 0) {
        ent->ri_devid = 0;
        ent->ri_old_devid = 0;
    } else {
        /* The element gets its own copy of the devid(s) */
        idsz = clp->l_devid_sz;
        ent->ri_devid = (ddi_devid_t)kmem_zalloc(idsz, KM_SLEEP);
        bcopy((void *)(uintptr_t)clp->l_devid, (char *)ent->ri_devid,
            idsz);

        if (clp->l_old_devid != 0) {
            idsz = clp->l_old_devid_sz;
            ent->ri_old_devid = (ddi_devid_t)kmem_zalloc(idsz,
                KM_SLEEP);
            bcopy((char *)(uintptr_t)clp->l_old_devid,
                (char *)ent->ri_old_devid, idsz);
        } else {
            ent->ri_old_devid = 0;
        }

        /* A minor name that does not fit is not recorded at all */
        if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
            (void) strcpy(ent->ri_minor_name, clp->l_minor_name);

        if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
            /*
             * Devid is present, but not valid.  This could
             * happen if device has been powered off or if
             * the device has been removed.  Mark the device in
             * error.  Don't allow any writes to this device
             * based on the dev_t since another device could
             * have been placed in its spot and be responding to
             * the dev_t accesses.
             */
            ent->ri_flags |= MDDB_F_EMASTER;
        }
    }

    /*
     * Add this entry to the end of the rip list.  If the list is
     * empty the entry becomes the list.
     */
    for (linkp = rip; *linkp != NULL; linkp = &(*linkp)->ri_next)
        ;
    *linkp = ent;
    return (0);
}

/*
 * writecopy writes the incore data blocks out to the replica with
 * locator index li.
 * This is called from writestart
 *  - when a diskset is started or
 *  - when an error has been encountered during the write to a mddb.
 * and from newdev when a new mddb is being added.
 *
 * flag can be 2 values:
 *  MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *      always used for traditional and local disksets.
 *      For MN diskset:
 *          All nodes can call writecopy, but only the
 *          master node actually writes data to the disk
 *          except for optimized resync records.
 *          An optimized resync record can only be written to
 *          by the record owner.
 *  MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *      master has been chosen, the new master may need to
 *      write its incore mddb to disk (this is the case where the
 *      old master had executed a message but hadn't relayed it
 *      to this slave yet).  New master should not write the
 *      change log records since new master would be overwriting
 *      valuable data.  Only used during a reconfig cycle.
 *
 * Returns 0 on success, or the first error returned by writeblks() or
 * wrtblklst(); nothing further is written after an error.
 */
static int
writecopy(
    mddb_set_t  *s,
    int     li,
    int     flag
)
{
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    int     err = 0;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        mddb_db32_t *ondisk_db;

        /* Directory block first: build, checksum and write it */
        ondisk_db = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
        create_db32rec(ondisk_db, dbp);
        crcgen(ondisk_db, &ondisk_db->db32_checksum, MDDB_BSIZE, NULL);
        err = writeblks(s, (caddr_t)ondisk_db, dbp->db_blknum, 1, li,
            MDDB_WR_ONLY_MASTER);
        kmem_free((caddr_t)ondisk_db, MDDB_BSIZE);
        if (err != 0)
            return (err);

        /* Then every record the directory block describes */
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            mddb_rb32_t *rec;
            uint_t      fiddle;

            /*
             * In a multinode diskset, when a new master is
             * chosen the new master may need to write its
             * incore copy of the mddb to disk.  In this case,
             * don't want to overwrite the change log records
             * so new master sets flag to MDDB_WRITECOPY_SYNC.
             */
            if ((flag == MDDB_WRITECOPY_SYNC) &&
                (dep->de_flags & MDDB_F_CHANGELOG))
                continue;

            /*
             * In a multinode diskset, don't write out optimized
             * resync records since only the mirror owner node
             * will have the correct data.  If writecopy is
             * being called from writestart as a result of
             * an mddb failure, then writestart will handle
             * the optimized records when it calls fixoptrecords.
             */
            if ((MD_MNSET_SETNO(s->s_setno)) &&
                (dep->de_flags & MDDB_F_OPT))
                continue;

            /*
             * Regenerate the crc and adjust rb_checksum_fiddle so
             * that rb_checksum ^ rb_checksum_fiddle is the same
             * after the regeneration as it was before.
             */
            rec = dep->de_rb;
            fiddle = rec->rb_checksum_fiddle;
            fiddle ^= rec->rb_checksum;
            /* Generate the crc for this record */
            rec_crcgen(s, dep, rec);
            fiddle ^= rec->rb_checksum;
            rec->rb_checksum_fiddle = fiddle;

            err = wrtblklst(s, (caddr_t)rec, dep->de_blks,
                dep->de_blkcount, li, (mddb_bf_t **)0,
                MDDB_WR_ONLY_MASTER);
            if (err != 0)
                return (err);
        }
    }
    return (0);
}

/*
 * Update the mediator hosts of a set with the current commit count of
 * the locator block, and mark the mediator data golden when the replica
 * situation calls for it.
 *
 * s   - set whose mediator hosts are updated
 * tag - passed through to upd_med_hosts()
 *
 * Panics if fewer than half of the mediator hosts could be updated while
 * half or less of the existing replicas are accessible.
 *
 * Always returns 0.
 */
static int
upd_med(
    mddb_set_t  *s,
    char        *tag
)
{
    med_data_t  meddb;
    mddb_lb_t   *lbp = s->s_lbp;
    set_t       setno = s->s_setno;
    int     medok;
    int     li;
    int     nrepl;      /* replicas that exist */
    int     nrepl_ok;   /* replicas that are accessible */

    /* If no mediator hosts, nothing to do */
    if (s->s_med.n_cnt == 0)
        return (0);

    /*
     * If this is a MN set and we are not the master, then don't
     * update mediator hosts or mark mediator as golden since
     * only master node should do that.
     */
    if (setno != MD_LOCAL_SET) {
        if ((lbp->lb_flags & MDDB_MNSET) &&
            (md_set[setno].s_am_i_master == 0)) {
            return (0);
        }
    }

    /* Build the mediator data for the current state of the set */
    bzero((char *)&meddb, sizeof (med_data_t));
    meddb.med_dat_mag = MED_DATA_MAGIC;
    meddb.med_dat_rev = MED_DATA_REV;
    meddb.med_dat_fl = 0;
    meddb.med_dat_sn = setno;
    meddb.med_dat_cc = lbp->lb_commitcnt;
    TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
    crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);

    /* count accessible mediators */
    medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);

    /* count accessible and existing replicas */
    nrepl = 0;
    nrepl_ok = 0;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        mddb_locator_t  *lp = &lbp->lb_locators[li];

        /* Deleted replicas do not exist any more */
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        nrepl++;

        /* Accessible means active and free of errors */
        if ((lp->l_flags & MDDB_F_ACTIVE) &&
            !(lp->l_flags & MDDB_F_EMASTER) &&
            !(lp->l_flags & MDDB_F_EWRITE))
            nrepl_ok++;
    }

    /*
     * Mediator update quorum is >= 50%: check for less than
     * "mediator update" quorum.
     */
    if ((medok * 2) < s->s_med.n_cnt) {
        /* panic if <= 50% of all replicas are accessible */
        if ((nrepl > 0) && ((nrepl_ok * 2) <= nrepl)) {
            cmn_err(CE_PANIC,
                "md: Update of 50%% of the mediator hosts failed");
            /* NOTREACHED */
        }

        cmn_err(CE_WARN,
            "md: Update of 50%% of the mediator hosts failed");
    }

    /*
     * If we have mediator update quorum and exactly 50% of the replicas
     * are accessible then mark the mediator as golden.
     */
    if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (nrepl > 0) &&
        ((nrepl_ok * 2) == nrepl)) {
        meddb.med_dat_fl = MED_DFL_GOLDEN;
        crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
        (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
    }

    return (0);
}

/*
 * Push the incore locator block of a set to all the replicas.
 *
 * The locator block gets a new timestamp and the revision that goes with
 * the kind of set (MDDB_REV_MNLB for a MN diskset, MDDB_REV_LB otherwise)
 * before it is written with writelocall().
 *
 * The updates to the mediator hosts are done by the callers of this
 * function.
 *
 * Returns the value returned by writelocall().
 */
static int
push_lb(mddb_set_t *s)
{
    mddb_lb_t   *lbp = s->s_lbp;

    uniqtime32(&lbp->lb_timestamp);
    lbp->lb_revision = (MD_MNSET_SETNO(s->s_setno)) ?
        MDDB_REV_MNLB : MDDB_REV_LB;

    return (writelocall(s));
}

/*
 * Compare two data tags.
 *
 * The tags are compared by set number, then serial number, then host
 * name and finally timestamp; the first field that differs decides.
 *
 * Returns 0 if the tags are the same in all four fields, non-zero
 * otherwise.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
{
    int         rv;

    rv = (int)(odtp->dt_setno - ndtp->dt_setno);

    if (rv == 0)
        rv = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);

    if (rv == 0) {
        rv = strncmp(odtp->dt_hn, ndtp->dt_hn,
            MD_MAX_NODENAME_PLUS_1);
    }

    if (rv == 0) {
        /*CSTYLED*/
        rv = timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=);
    }

    return (rv);
}

/*
 * Add a data tag to the end of the set's data tag list, unless the list
 * already has a tag that compares equal (see dtl_cmp).
 *
 * A tag that is added gets a dt_id of one more than the number of tags
 * that were on the list before it.
 *
 * Always returns 0.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
{
    mddb_dtag_lst_t **linkp = &s->s_dtlp;
    mddb_dtag_lst_t *newp;
    int     ntags = 0;

    /* Run to the end of the list, bail out if the tag is on it */
    while (*linkp != NULL) {
        if (dtl_cmp(&(*linkp)->dtl_dt, ndtp) == 0)
            return (0);
        ntags++;
        linkp = &(*linkp)->dtl_nx;
    }

    /* Hook a new, zeroed member onto the end of the list */
    newp = kmem_zalloc(sizeof (*newp), KM_SLEEP);
    *linkp = newp;

    /* Fill in the dtag portion of the new member */
    bcopy((caddr_t)ndtp, (caddr_t)&newp->dtl_dt, sizeof (mddb_dtag_t));

    /* The copy brought along the caller's id: fix it up */
    newp->dtl_dt.dt_id = ntags + 1;

    return (0);
}

/*
 * Return the number of data tags on the set's data tag list.
 *
 * Even though data tags are not supported in MN disksets, dtl_cntl may
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * For a MN diskset, s_dtlp is 0 so a count of 0 is returned.
 */
static int
dtl_cntl(mddb_set_t *s)
{
    mddb_dtag_lst_t *dtlp;
    int     ndt = 0;

    for (dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
        ndt++;

    return (ndt);
}

/*
 * Find the data tag with the given id on the set's data tag list.
 *
 * Returns a pointer to the tag inside the list, or NULL if no tag on the
 * list has that id.
 *
 * Even though data tags are not supported in MN disksets, dtl_findl may
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * For a MN diskset, s_dtlp is 0 so NULL is returned.
 */
static mddb_dtag_t *
dtl_findl(mddb_set_t *s, int id)
{
    mddb_dtag_lst_t *dtlp;

    for (dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx) {
        if (dtlp->dtl_dt.dt_id == id)
            return (&dtlp->dtl_dt);
    }

    return ((mddb_dtag_t *)NULL);
}

/*
 * Free every member of a data tag list and set the list head to NULL.
 *
 * dtlpp - address of the list head
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static void
dtl_freel(mddb_dtag_lst_t **dtlpp)
{
    mddb_dtag_lst_t *cur = *dtlpp;

    while (cur != NULL) {
        /* Pick up the link before the member goes away */
        mddb_dtag_lst_t *next = cur->dtl_nx;

        kmem_free(cur, sizeof (mddb_dtag_lst_t));
        cur = next;
    }

    *dtlpp = (mddb_dtag_lst_t *)NULL;
}

/*
 * Set up the data tag record of a set (md_set[setno].s_dtp).
 *
 * The record is allocated if the set does not have one yet.  An existing
 * record is cleared first when no tag is given.  The record then gets its
 * magic number and revision, the tag (if one is given) with the set
 * number filled in and the id cleared, and a new checksum.
 *
 * s     - set whose data tag record is set up
 * dtagp - tag to store in the record, or NULL for none
 *
 * Even though data tags are not supported in MN disksets, dt_setup will
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * Once this set is known as a MN diskset, the dtp area will be freed.
 */
static void
dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
{
    set_t       setno = s->s_setno;
    mddb_dt_t   *dtp;

    if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) {
        /* First time for this set: get a zeroed record */
        md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
    } else if (dtagp == (mddb_dtag_t *)NULL) {
        /* No tag to store: wipe whatever the record held */
        bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
    }

    /* shorthand */
    dtp = (mddb_dt_t *)md_set[setno].s_dtp;

    dtp->dt_mag = MDDB_MAGIC_DT;
    dtp->dt_rev = MDDB_REV_DT;

    if (dtagp != NULL) {
        /* structure assignment */
        dtp->dt_dtag = *dtagp;
    }

    /* The tag always carries the number of the set it is stored in */
    dtp->dt_dtag.dt_setno = setno;

    /* Clear the id and flags, this is only used in user land */
    dtp->dt_dtag.dt_id = 0;

    /* Checksum it */
    crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
}

/*
 * Build a new data tag for this host (hostid, nodename, current time),
 * install it in the set's data tag record and make it the only entry on
 * the set's tag list.
 *
 * Returns 0 on success.  If no tag record is allocated in the locator
 * block, returns 0 for a MN set (data tags are not used there) and
 * otherwise warns, sets MDE_DB_NOTAGREC in ep and returns 1.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
set_dtag(mddb_set_t *s, md_error_t *ep)
{
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_dtag_t newtag;

    if (lbp->lb_dtblkcnt == 0) {
        if (!(lbp->lb_flags & MDDB_MNSET)) {
            cmn_err(CE_WARN,
                "No tag record allocated, unable to tag data");
            (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32,
                s->s_setno);
            return (1);
        }

        /* Data tags not used in a MN set - so no failure returned */
        return (0);
    }

    /* Start from an all-zero tag */
    bzero((caddr_t)&newtag, sizeof (mddb_dtag_t));

    /* HW serial number for this host */
    (void) snprintf(newtag.dt_sn, MDDB_SN_LEN, "%u",
        zone_get_hostid(NULL));
    newtag.dt_sn[MDDB_SN_LEN - 1] = '\0';

    /* Nodename that this host goes by */
    (void) strncpy(newtag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
    newtag.dt_hn[MD_MAX_NODENAME] = '\0';

    /* Time stamp for NOW */
    uniqtime32(&newtag.dt_tv);

    /* Install the tag in the data tag record */
    dt_setup(s, &newtag);

    /* Replace whatever tag list exists with just the new tag */
    dtl_freel(&s->s_dtlp);
    (void) dtl_addl(s, &newtag);

    return (0);
}

/*
 * Read the data tag record of one replica into rip->ri_dtp and validate
 * it (magic, revision, checksum, set number, non-empty tag).
 *
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 * Should not call for MN diskset since data tags are not supported.
 *
 * Returns:
 *  0 - a valid, non-empty tag was read.  rip->ri_dtp holds the record
 *      and MDDB_F_TAGDATA is set in rip->ri_flags.
 *  1 - no tag record is allocated in lbp, the device could not be
 *      translated, or the record could not be read or failed
 *      validation.  In the read/validation cases MD_SET_BADTAG is set
 *      on the set and MDDB_F_BADTAG in rip->ri_flags.
 *  2 - the record is valid but the tag in it is empty.
 *
 * On every non-zero return no buffer allocated here is left behind:
 * rip->ri_dtp is either untouched (no tag record allocated) or NULL.
 */
static int
dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
{
    int     err = 0;
    md_dev64_t  dev;
    caddr_t     tbuf;
    daddr_t     physblk;
    mddb_block_t    blk;
    mddb_dt_t   *dtp;
    mddb_dtag_t *dtagp;
    set_t       setno = s->s_setno;

    /* If have not allocated a data tag record, there is nothing to do */
    if (lbp->lb_dtblkcnt == 0)
        return (1);

    /* KM_SLEEP allocations block until they succeed; no NULL check */
    dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);

    /* shorthand */
    dev = md_xlate_targ_2_mini(rip->ri_dev);
    if (dev == NODEV64) {
        /*
         * Nothing was read, so don't leave a zeroed, unvalidated
         * record hanging off the rip.  As before, this case does not
         * flag the tag as bad.
         */
        kmem_free(dtp, MDDB_DT_BYTES);
        rip->ri_dtp = (mddb_dt_t *)NULL;
        return (1);
    }

    /*
     * The block count comes from the on-disk locator block.  Refuse a
     * count that would make the read loop below run past the end of
     * the MDDB_DT_BYTES buffer.
     */
    if ((size_t)lbp->lb_dtblkcnt * MDDB_BSIZE > MDDB_DT_BYTES) {
        err = 1;
        goto out;
    }

    tbuf = (caddr_t)rip->ri_dtp;

    for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
        physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
        err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
        /* error reading the tag */
        if (err) {
            err = 1;
            goto out;
        }
        tbuf += MDDB_BSIZE;
    }

    /* magic is valid? */
    if (dtp->dt_mag != MDDB_MAGIC_DT) {
        err = 1;
        goto out;
    }

    /* revision is valid? */
    if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
        err = 1;
        goto out;
    }

    /* crc is valid? */
    if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
        err = 1;
        goto out;
    }

    /* shorthand */
    dtagp = &dtp->dt_dtag;

    /* set number match? */
    if (dtagp->dt_setno != setno) {
        err = 1;
        goto out;
    }

    /* tag is not empty? */
    if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
        (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
        dtagp->dt_id == 0) {
        err = 2;
        goto out;
    }

    /* Mark the locator as having tagged data */
    rip->ri_flags |= MDDB_F_TAGDATA;

out:
    if (err) {
        /* Only a read/validation failure marks the tag as bad */
        if (err == 1) {
            md_set_setstatus(setno, MD_SET_BADTAG);
            rip->ri_flags |= MDDB_F_BADTAG;
        }
        kmem_free(dtp, MDDB_DT_BYTES);
        rip->ri_dtp = (mddb_dt_t *)NULL;
    }

    return (err);
}

/*
 * Write the set's in-core data tag record to every usable replica
 * (active, not deleted, no write error) and update the per-locator tag
 * flags to match.  If all writes succeed and no usable locator is left
 * with MDDB_F_TAGDATA, MD_SET_CLRTAG and MD_SET_TAGDATA are cleared on
 * the set.
 *
 * Returns 0 on success, or when there is nothing to do (no tag record
 * allocated, or MD_SET_BADTAG is set); otherwise the OR of the
 * writeblks() errors.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_write(mddb_set_t *s)
{
    mddb_lb_t   *lbp = s->s_lbp;
    set_t       setno = s->s_setno;
    uint_t      set_status = md_get_setstatus(setno);
    mddb_dt_t   *rec;
    mddb_dtag_t *tag;
    mddb_locator_t  *lp;
    int     tag_is_empty;
    int     tags_remain = 0;
    int     err = 0;
    int     werr;
    int     li;

    ASSERT(md_set[setno].s_dtp != NULL);

    /* Nowhere to write to, or the on-disk tag was seen to be bad */
    if (lbp->lb_dtblkcnt == 0 || (set_status & MD_SET_BADTAG))
        return (0);

    rec = (mddb_dt_t *)md_set[setno].s_dtp;
    tag = &rec->dt_dtag;

    /* A tag with no serial number, hostname, time or id is empty */
    tag_is_empty = (tag->dt_sn[0] == '\0' && tag->dt_hn[0] == '\0' &&
        tag->dt_tv.tv_sec == 0 && tag->dt_tv.tv_usec == 0 &&
        tag->dt_id == 0);

    /* Write the tag to the locators and reset appropriate flags. */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];

        if (!(lp->l_flags & MDDB_F_ACTIVE) ||
            (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
            continue;

        werr = writeblks(s, (caddr_t)rec, lbp->lb_dtfirstblk,
            MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
        if (werr) {
            /* Remember the failure but keep going */
            err |= werr;
            continue;
        }

        lp->l_flags &= ~MDDB_F_BADTAG;
        if (tag_is_empty)
            lp->l_flags &= ~MDDB_F_TAGDATA;
        else
            lp->l_flags |= MDDB_F_TAGDATA;
    }

    if (err)
        return (err);

    /* The tags were written; see whether any usable locator has one. */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];

        if (!(lp->l_flags & MDDB_F_ACTIVE) ||
            (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
            continue;

        if (lp->l_flags & MDDB_F_TAGDATA) {
            tags_remain = 1;
            break;
        }
    }

    /* If there are no tags, then clear CLRTAG and TAGDATA */
    if (!tags_remain) {
        md_clr_setstatus(setno, MD_SET_CLRTAG);
        md_clr_setstatus(setno, MD_SET_TAGDATA);
    }

    return (0);
}

/*
 * Make sure the locator block has a usable data tag record.
 *
 * Returns 0 when nothing was done: the record is already allocated and
 * no bad tag was seen, the block bitmap is not set up yet, or no free
 * blocks could be found.  Returns 1 when the record was newly allocated
 * or moved (the usable locators are then flagged MDDB_F_BADTAG), or when
 * the existing blocks were kept and marked busy.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_alloc_if_needed(mddb_set_t *s)
{
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_block_t    old_blkcnt = lbp->lb_dtblkcnt;
    uint_t      badtag = md_get_setstatus(s->s_setno) & MD_SET_BADTAG;
    int     relocate = 0;
    int     i;
    int     li;

    /*
     * If the data tag record is allocated and a bad tag was not
     * detected, there is nothing to do.
     */
    if (old_blkcnt != 0 && !badtag)
        return (0);

    /* Bitmap not setup, checks can't be done */
    if (s->s_totalblkcnt == 0)
        return (0);

    /*
     * While reading the tag(s) an invalid tag data record was seen;
     * see if the invalid tag needs to be moved.
     */
    if (badtag) {
        for (i = 0; i < MDDB_DT_BLOCKS && !relocate; i++) {
            if (blkcheck(s, (i + lbp->lb_dtfirstblk)))
                relocate = 1;
        }
    }

    if (!relocate && old_blkcnt != 0) {
        /*
         * Keeping the record where it is.  Make sure the blocks are
         * owned, since the calculation in computefreeblks() is
         * bypassed when MD_SET_BADTAG is set.
         */
        for (i = 0; i < MDDB_DT_BLOCKS; i++)
            blkbusy(s, (i + lbp->lb_dtfirstblk));

        return (1);
    }

    /*
     * Need to move or allocate the tag data record.
     * NOTE(review): on failure lb_dtfirstblk is left at 0 while
     * lb_dtblkcnt keeps its previous value - confirm that is intended
     * for the "move" case.
     */
    lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
    if (lbp->lb_dtfirstblk == 0) {
        cmn_err(CE_WARN,
            "Unable to allocate data tag record");
        return (0);
    }
    lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;

    /* Mark the locators so that they get written to disk. */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        mddb_locator_t  *lp = &lbp->lb_locators[li];

        if (!(lp->l_flags & MDDB_F_ACTIVE) ||
            (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
            continue;

        lp->l_flags |= MDDB_F_BADTAG;
    }

    return (1);
}

/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *  MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *      always used for traditional and local disksets.
 *      This is the normal path for MN disksets since the slave
 *      nodes aren't actually allowed to write to disk.
 *  MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *      master has been chosen, the new master may need to
 *      write its incore mddb to disk (this is the case where the
 *      old master had executed a message but hadn't relayed it
 *      to this slave yet).  New master should not write the
 *      change log records since new master would be overwriting
 *      valuable data.  Only used during a reconfig cycle.
 *
 * Returns 0 on success (MD_SET_STALE is then cleared on the set),
 * 1 if a writecopy() or fixoptrecords() call failed, or the error
 * from push_lb()/writeall() if the locator block or locator names
 * could not be written.
 */
static int
writestart(
    mddb_set_t  *s,
    int     flag
)
{
    int     li;
    mddb_locator_t  *lp;
    mddb_lb_t   *lbp;
    mddb_ln_t   *lnp;
    int     err = 0;
    uint_t      set_status;

    lbp = s->s_lbp;

    /* First pass: rewrite every active replica marked suspect */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        if (! (lp->l_flags & MDDB_F_SUSPECT))
            continue;
        if (writecopy(s, li, flag))
            return (1);
        lp->l_flags |= MDDB_F_UP2DATE;
    }

    /*
     * Second pass: for the remaining active replicas not yet marked
     * up to date, rewrite only those for which checkcopy() returns
     * non-zero.
     */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        if ((lp->l_flags & MDDB_F_UP2DATE))
            continue;
        if (checkcopy(s, li))
            if (err = writecopy(s, li, flag))
                return (1);
        lp->l_flags |= MDDB_F_UP2DATE;
    }

    /*
     * Call fixoptrecords even during a reconfig cycle since a replica
     * failure may force the master to re-assign the optimized
     * resync record to another replica.
     */
    if (fixoptrecords(s))
        return (1);

    set_status = md_get_setstatus(s->s_setno);

    /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];

        if (lp->l_flags & MDDB_F_DELETED)
            continue;

        if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
            (lp->l_flags & MDDB_F_OLDACT) == 0) ||
            ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
            (lp->l_flags & MDDB_F_OLDACT) != 0))
            break;

        /*
         * Also stop at a locator carrying tag flags while the set
         * has TAGDATA or CLRTAG status.
         */
        if ((set_status & MD_SET_TAGDATA) ||
            (set_status & MD_SET_CLRTAG))
            if ((lp->l_flags & MDDB_F_TAGDATA) ||
                (lp->l_flags & MDDB_F_BADTAG))
                break;
    }

    /*
     * The locator block and locator names need rewriting if the scan
     * above stopped early (a locator is ACTIVE and not OLDACT, or not
     * ACTIVE and OLDACT, or has tag flags to be dealt with), or if the
     * lbp identifier and the set identifier don't match.
     */
    if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {

        /* Only call for traditional and local sets */
        if (!(lbp->lb_flags & MDDB_MNSET))
            (void) dt_write(s);

        setidentifier(s, &lbp->lb_ident);

        /*
         * The locator block is pushed twice; upd_med() is called
         * after each push, whether or not the push succeeded.
         */
        if (err = push_lb(s)) {
            (void) upd_med(s, "writestart(0)");
            return (err);
        }

        (void) upd_med(s, "writestart(0)");

        if (err = push_lb(s)) {
            (void) upd_med(s, "writestart(1)");
            return (err);
        }

        (void) upd_med(s, "writestart(1)");

        /* Re-stamp, re-checksum and write out the locator names */
        lnp = s->s_lnp;
        uniqtime32(&lnp->ln_timestamp);
        if (lbp->lb_flags & MDDB_MNSET)
            lnp->ln_revision = MDDB_REV_MNLN;
        else
            lnp->ln_revision = MDDB_REV_LN;
        crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
        err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
            lbp->lb_lnblkcnt, 0);
        /*
         * If a MN diskset and this is the master, set the PARSE_LOCNM
         * flag in the mddb_set structure to show that the locator
         * names have changed.
         * Don't set parseflags as a result of a new master sync
         * during reconfig cycle since slaves nodes are already
         * in-sync with the new master.
         */

        if ((lbp->lb_flags & MDDB_MNSET) &&
            (md_set[s->s_setno].s_am_i_master) &&
            (flag != MDDB_WRITECOPY_SYNC)) {
            s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
        }

        /* The writeall() result is checked only after the flag update */
        if (err)
            return (err);
    }

    /* Record the current ACTIVE state of each locator in OLDACT */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        if (lp->l_flags & MDDB_F_ACTIVE) {
            lp->l_flags |= MDDB_F_OLDACT;
        } else {
            lp->l_flags &= ~MDDB_F_OLDACT;
        }
    }

    md_clr_setstatus(s->s_setno, MD_SET_STALE);

    return (0);
}

/*
 * selectreplicas selects the working replicas and may write the incore
 * version of the mddb out to the replicas ondisk.
 *
 * flag can be 3 values:
 *  MDDB_RETRYSCAN - quick scan to see if there is an error.
 *          If no new error, returns without writing mddb
 *          to disks.  If a new error is seen, writes out
 *          mddb to disks.
 *  MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
 *          out mddb to the replica ondisk.  Calls writestart
 *          with MDDB_WRITECOPY_ALL flag which writes out
 *          all records to the replicas ondisk.
 *  MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
 *          and ondisk mddbs by writing incore values to disk.
 *          Calls writestart with MDDB_WRITECOPY_SYNC flag so
 *          that change log records are not written out.
 *          Only used by MN disksets.
 *
 * Returns:
 *  0 - Successful
 *  non-zero - Unable to write incore mddb data to disk since < 50%
 *          replicas (or the set is stale).  For MDDB_RETRYSCAN with
 *          no new error this is the set's current MD_SET_TOOFEW bit.
 */
int
selectreplicas(
    mddb_set_t  *s,
    int     flag
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    set_t       setno = s->s_setno;
    mddb_locator_t  *lp;
    int     li;
    int     total;      /* replicas not deleted */
    int     active;     /* of those, the active ones */
    int     quorum;
    int     wc_flag;

    /*
     * can never transition from stale to not stale
     */
    if (md_get_setstatus(setno) & MD_SET_STALE) {
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if (lp->l_flags & MDDB_F_DELETED)
                continue;
            if (lp->l_flags & MDDB_F_EMASTER)
                lp->l_flags &= ~MDDB_F_ACTIVE;
            else
                lp->l_flags |= MDDB_F_ACTIVE;
        }
        return (1);
    }

    if (flag != MDDB_SCANALL && flag != MDDB_SCANALLSYNC) {
        /* Quick scan: look for an active replica with a write error */
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if ((lp->l_flags & MDDB_F_ACTIVE) &&
                (lp->l_flags & MDDB_F_EWRITE))
                break;
        }

        /*
         * No such replica: any earlier error has already been
         * processed, so return the current state.
         */
        if (li == lbp->lb_loccnt)
            return (md_get_setstatus(setno) & MD_SET_TOOFEW);

        /*
         * Deactivate the failing replica, then drop UP2DATE on it
         * and on every locator after it.
         */
        lp->l_flags &= ~MDDB_F_ACTIVE;
        for (; li < lbp->lb_loccnt; li++)
            lbp->lb_locators[li].l_flags &= ~MDDB_F_UP2DATE;
    } else {
        /* Full scan: re-derive the state of every replica */
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if (lp->l_flags & MDDB_F_DELETED)
                continue;

            /* Remember the previous activity; inactive => suspect */
            if (lp->l_flags & MDDB_F_ACTIVE) {
                lp->l_flags |= MDDB_F_OLDACT;
                lp->l_flags &= ~MDDB_F_SUSPECT;
            } else {
                lp->l_flags |= MDDB_F_SUSPECT;
                lp->l_flags &= ~MDDB_F_OLDACT;
            }

            /* Everything without a master block error is retried */
            if (lp->l_flags & MDDB_F_EMASTER) {
                lp->l_flags &= ~MDDB_F_ACTIVE;
            } else {
                lp->l_flags |= MDDB_F_ACTIVE;
                lp->l_flags &= ~(MDDB_F_EWRITE | MDDB_F_TOOSMALL);
            }
        }
        computefreeblks(s); /* set up free block bits */
    }

    /* Count the replicas that exist and those that are active */
    total = 0;
    active = 0;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        total++;
        if (lp->l_flags & MDDB_F_ACTIVE)
            active++;
    }

    quorum = (total + 1) / 2;
    if (active < quorum) {
        md_set_setstatus(setno, MD_SET_TOOFEW);
        return (1);
    }

    /* Set wc_flag based on flag passed in. */
    wc_flag = (flag == MDDB_SCANALLSYNC) ?
        MDDB_WRITECOPY_SYNC : MDDB_WRITECOPY_ALL;

    /*
     * Keep trying to write while enough replicas survive.  After each
     * failed attempt, replicas that took a write error are deactivated
     * and the survivors recounted.
     */
    for (;;) {
        if (writestart(s, wc_flag) == 0) {
            md_clr_setstatus(setno, MD_SET_TOOFEW);
            return (0);
        }

        active = 0;
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EMASTER))
                continue;

            if (lp->l_flags & MDDB_F_EWRITE) {
                lp->l_flags &=
                    ~(MDDB_F_ACTIVE | MDDB_F_UP2DATE);
                continue;
            }
            active++;
        }

        if (active < quorum)
            break;
    }

    md_set_setstatus(setno, MD_SET_TOOFEW);
    return (1);
}

/*
 * Check whether the set's replicas are in a writable state.
 *
 * Returns 0 if the set is neither stale nor short of replicas.
 * Otherwise, with probe == MDDB_NOPROBE, returns 1 without touching the
 * replicas; with any other probe value a full selectreplicas() scan is
 * run and its result returned.  If that scan succeeds and a zombie
 * record is pending (s_zombie), the record is deleted with the set mutex
 * dropped around the call, and the result of the delete is returned.
 */
static int
checkstate(
    mddb_set_t  *s,
    int     probe
)
{
    int     error;
    uint_t      set_status;

    /* Must come before the first dereference of s to be of any use */
    ASSERT(s != NULL);

    set_status = md_get_setstatus(s->s_setno);

    if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
        return (0);

    if (probe == MDDB_NOPROBE)
        return (1);

    single_thread_start(s);
    error = selectreplicas(s, MDDB_SCANALL);
    single_thread_end(s);

    if (error == 0 && s->s_zombie != 0) {
        /* mddb_deleterec() is called without the set mutex held */
        mutex_exit(SETMUTEX(s->s_setno));
        error = mddb_deleterec(s->s_zombie);
        mutex_enter(SETMUTEX(s->s_setno));
        if (error == 0)
            s->s_zombie = 0;
    }
    return (error);
}

/*
 * Retry after a write error: try a quick MDDB_RETRYSCAN first and fall
 * back to a full MDDB_SCANALL only if the quick scan fails.
 * Returns 0 if either scan succeeds, 1 if both fail.
 */
static int
writeretry(
    mddb_set_t  *s
)
{
    if (selectreplicas(s, MDDB_RETRYSCAN) == 0)
        return (0);

    return (selectreplicas(s, MDDB_SCANALL) ? 1 : 0);
}

/*
 * Free every entry on the incore master block list headed by *mbipp
 * and reset the head to NULL.
 */
static void
free_mbipp(mddb_mb_ic_t **mbipp)
{
    mddb_mb_ic_t    *cur = *mbipp;

    while (cur != NULL) {
        /* Grab the successor before the entry is released */
        mddb_mb_ic_t    *next = cur->mbi_next;

        kmem_free((caddr_t)cur, MDDB_IC_BSIZE);
        cur = next;
    }

    *mbipp = NULL;
}

/*
 * Build a copy of the set's replica info list (s->s_rip), in the same
 * order, for use as hints.  Each copy starts as a structure assignment
 * of the original and then has its flags, commit count, transplant
 * marker and every pointer member cleared, so the copies share no
 * storage with the originals.  Returns the head of the new list, or
 * NULL if s->s_rip is empty.
 */
static mddb_ri_t *
save_rip(mddb_set_t *s)
{
    mddb_ri_t   *head = NULL;
    mddb_ri_t   **tailp = &head;    /* where the next copy is linked */
    mddb_ri_t   *src;

    for (src = s->s_rip; src != NULL; src = src->ri_next) {
        mddb_ri_t   *copy = kmem_zalloc(sizeof (*copy), KM_SLEEP);

        ASSERT(copy != NULL);

        *copy = *src;           /* structure assignment */

        /* Clear the stuff that is not needed for hints */
        copy->ri_flags = 0;
        copy->ri_commitcnt = 0;
        copy->ri_transplant = 0;
        copy->ri_mbip = NULL;
        copy->ri_dtp = NULL;
        copy->ri_lbp = NULL;
        copy->ri_did_icp = NULL;
        copy->ri_devid = NULL;
        copy->ri_old_devid = NULL;
        copy->ri_next = NULL;

        /* Append to the end of the new list */
        *tailp = copy;
        tailp = &copy->ri_next;
    }

    return (head);
}

/*
 * Free every entry on the replica info list headed by *ripp, releasing
 * the current and old devids of each entry first, and reset the head
 * to NULL.
 */
static void
free_rip(mddb_ri_t **ripp)
{
    mddb_ri_t   *cur = *ripp;

    while (cur != NULL) {
        /* Grab the successor before the entry is released */
        mddb_ri_t   *next = cur->ri_next;

        if (cur->ri_devid != NULL) {
            ddi_devid_free(cur->ri_devid);
            cur->ri_devid = NULL;
        }
        if (cur->ri_old_devid != NULL) {
            ddi_devid_free(cur->ri_old_devid);
            cur->ri_old_devid = NULL;
        }
        kmem_free((caddr_t)cur, sizeof (*cur));

        cur = next;
    }

    *ripp = NULL;
}

/*
 * this routine selects the correct replica to use
 * the rules are as follows
 *  1.  if all replica has same init time select highest commit count
 *  2.  if some but not all replicas are from another hostid discard
 *      them.
 *  3.  find which init time is present is most replicas
 *  4.  discard all replicas which do not match most init times
 *  5.  select replica with highest commit count
 */

static mddb_lb_t *
selectlocator(
    mddb_set_t  *s
)
{
    mddb_ri_t   *rip = s->s_rip;
    mddb_ri_t   *r, *r1;
    mddb_lb_t   *lbp;
    struct timeval32 *tp = (struct timeval32 *)NULL;
    int     different;
    int     same;
    int     count;
    int     maxcount;
    set_t       setno = s->s_setno;
    size_t      sz;
    int     mn_set = 0;

    /* Clear the ri_transplant flag on all the rip entries. */
    /* Set ri_commitcnt to locator's commitcnt - if available */
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        r->ri_transplant = 0;
        if (r->ri_lbp != (mddb_lb_t *)NULL) {
            r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
            /* If any locators have MN bit set, set flag */
            if (r->ri_lbp->lb_flags & MDDB_MNSET)
                mn_set = 1;
        }
    }

    /*
     * A data tag is being used, so use it to limit the selection first.
     * Data tags not used in MN diskset.
     */
    if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
        mddb_dt_t   *dtp = (mddb_dt_t *)md_set[setno].s_dtp;

        /*
         * now toss any locators that have a different data tag
         */
        for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
            if (r->ri_lbp == (mddb_lb_t *)NULL)
                continue;

            if (r->ri_dtp != (mddb_dt_t *)NULL) {
                /* If same tag, keep it */
                if (dtl_cmp(&dtp->dt_dtag,
                    &r->ri_dtp->dt_dtag) == 0)
                    continue;
            }

            if (r->ri_dtp != (mddb_dt_t *)NULL) {
                kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
                r->ri_dtp = (mddb_dt_t *)NULL;
            }

            mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
            if (!(md_get_setstatus(setno) &
                MD_SET_REPLICATED_IMPORT)) {
                if (r->ri_old_devid != (ddi_devid_t)NULL) {
                    sz = ddi_devid_sizeof(r->ri_old_devid);
                    kmem_free((caddr_t)r->ri_old_devid, sz);
                    r->ri_old_devid = (ddi_devid_t)NULL;
                }
            }

            kmem_free((caddr_t)r->ri_lbp,
                dbtob(r->ri_lbp->lb_blkcnt));
            r->ri_lbp = (mddb_lb_t *)NULL;

            r->ri_transplant = 1;
        }

        /* Tag used, clear the bit */
        md_clr_setstatus(s->s_setno, MD_SET_USETAG);

        if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
            /*
             * Get rid of the list of tags.
             */
            dtl_freel(&s->s_dtlp);

            /*
             * Re-create the list with the tag used.
             */
            (void) dtl_addl(s, &dtp->dt_dtag);
        }
    }

    /*
     * scan to see if all replicas have same time
     */
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        if (r->ri_lbp == (mddb_lb_t *)NULL)
            continue;
        if (tp == NULL) {
            tp = &r->ri_lbp->lb_inittime;
            continue;
        }
        /* CSTYLED */
        if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
            break;
    }

    /*
     * if r == NULL then they were all them same. Choose highest
     * commit count
     */
    if (r == (mddb_ri_t *)NULL)
        goto out;

    /*
     * If here, a bogus replica is present and at least 1 lb_inittime
     * did not match.
     */

    /*
     * look and see if any but not all are from different id
     */

    different = 0;
    same = 0;
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        if (r->ri_lbp == (mddb_lb_t *)NULL)
            continue;
        if (cmpidentifier(s, &r->ri_lbp->lb_ident))
            different = 1;
        else
            same = 1;
    }

    /*
     * now go through and throw out different if there are some
     * that are the same
     */
    if (different != 0 && same != 0) {
        for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
            if (r->ri_lbp == (mddb_lb_t *)NULL)
                continue;

            if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
                continue;

            if (r->ri_dtp != (mddb_dt_t *)NULL) {
                kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
                r->ri_dtp = (mddb_dt_t *)NULL;
            }

            mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
            if (!(md_get_setstatus(setno) &
                MD_SET_REPLICATED_IMPORT)) {
                if (r->ri_old_devid != (ddi_devid_t)NULL) {
                    sz = ddi_devid_sizeof(r->ri_old_devid);
                    kmem_free((caddr_t)r->ri_old_devid, sz);
                    r->ri_old_devid = (ddi_devid_t)NULL;
                }
            }

            kmem_free((caddr_t)r->ri_lbp,
                dbtob(r->ri_lbp->lb_blkcnt));
            r->ri_lbp = (mddb_lb_t *)NULL;

            r->ri_transplant = 1;
        }
    }

    /*
     * go through and pick highest. Use n square because it is
     * simple and 40 some is max possible
     */
    maxcount = 0;
    lbp = (mddb_lb_t *)NULL;
    for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
        if (r1->ri_lbp == (mddb_lb_t *)NULL)
            continue;
        count = 0;
        for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
            if (r->ri_lbp == (mddb_lb_t *)NULL)
                continue;
            if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
                &r->ri_lbp->lb_inittime, ==))
                count++;
        }
        if (count > maxcount) {
            maxcount = count;
            lbp = r1->ri_lbp;
        }
    }

    /*
     * now go though and toss any that are of a different time stamp
     */
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        if (r->ri_lbp == (mddb_lb_t *)NULL)
            continue;
        if (timercmp(&lbp->lb_inittime, /* CSTYLED */
            &r->ri_lbp->lb_inittime, ==))
            continue;

        if (r->ri_dtp != (mddb_dt_t *)NULL) {
            kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
            r->ri_dtp = (mddb_dt_t *)NULL;
        }

        mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
        if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
            if (r->ri_old_devid != (ddi_devid_t)NULL) {
                sz = ddi_devid_sizeof(r->ri_old_devid);
                kmem_free((caddr_t)r->ri_old_devid, sz);
                r->ri_old_devid = (ddi_devid_t)NULL;
            }
        }

        kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
        r->ri_lbp = (mddb_lb_t *)NULL;

        r->ri_transplant = 1;
    }

out:
    /*
     * Find the locator with the highest commit count, and make it the
     * "chosen" one.
     */
    lbp = (mddb_lb_t *)NULL;
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        if (r->ri_lbp == (mddb_lb_t *)NULL)
            continue;

        if (lbp == NULL) {
            lbp = r->ri_lbp;
            continue;
        }

        if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
            lbp = r->ri_lbp;
    }

    /* Toss all locator blocks, except the "chosen" one. */
    for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
        if (r->ri_lbp == (mddb_lb_t *)NULL)
            continue;

        /* Get rid of all dtp's */
        if (r->ri_dtp != (mddb_dt_t *)NULL) {
            kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
            r->ri_dtp = (mddb_dt_t *)NULL;
        }

        if (r->ri_lbp == lbp)
            continue;

        /* Get rid of extra locator devid block info */
        mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
        if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
            if (r->ri_old_devid != (ddi_devid_t)NULL) {
                sz = ddi_devid_sizeof(r->ri_old_devid);
                kmem_free((caddr_t)r->ri_old_devid, sz);
                r->ri_old_devid = (ddi_devid_t)NULL;
            }
        }

        /* Get rid of extra locators */
        kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
        r->ri_lbp = (mddb_lb_t *)NULL;
    }
    return (lbp);
}

/*
 * Fill in a config locator (clp) from entry li of the locator block for
 * the given side.
 *
 * For a MN set the side's entry is found by searching the mnsidelocator
 * array for sideno; if the side has no entry, clp is left untouched.
 * For other sets sideno indexes the sidelocator array directly.
 *
 * If the locator block is devid style and a devid exists for this
 * locator, then depending on clp->l_devid_flags either the devid and
 * minor name are copied out (MDDB_DEVID_SPACE; MDDB_DEVID_NOSPACE is
 * set instead if the caller's buffer is too small) or just the devid
 * size is reported (MDDB_DEVID_GETSZ).
 *
 * The block number, flags, dev, minor number and driver name are always
 * filled in.
 */
static void
locator2cfgloc(
    mddb_lb_t       *lbp,
    mddb_cfg_loc_t      *clp,
    int         li,
    side_t          sideno,
    mddb_did_ic_t       *did_icp
)
{
    mddb_locator_t      *locp = &lbp->lb_locators[li];
    mddb_sidelocator_t  *sidep = NULL;
    mddb_mnsidelocator_t    *mnsidep = NULL;
    mddb_drvnm_t        *drvp;
    int         is_mn = ((lbp->lb_flags & MDDB_MNSET) != 0);
    int         k;

    if (is_mn) {
        mddb_mnlb_t     *mnlbp = (mddb_mnlb_t *)lbp;

        /* Look for the slot that belongs to this side */
        for (k = 0; k < MD_MNMAXSIDES; k++) {
            mnsidep = &mnlbp->lb_mnsidelocators[k][li];
            if (mnsidep->mnl_sideno == sideno)
                break;
        }

        /* This side has no entry for the locator */
        if (k == MD_MNMAXSIDES)
            return;
    } else {
        sidep = &lbp->lb_sidelocators[sideno][li];
    }

    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        mddb_did_info_t *infop = &(did_icp->did_ic_blkp->blk_info[li]);

        if (infop->info_flags & MDDB_DID_EXISTS) {
            int devid_sz =
                (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);

            if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
                int avail = clp->l_devid_sz;

                if (devid_sz > avail) {
                    clp->l_devid_flags |=
                        MDDB_DEVID_NOSPACE;
                } else {
                    /*
                     * copy device id from mddb to
                     * cfg_loc structure
                     */
                    char    *dst =
                        (char *)(uintptr_t)clp->l_devid;
                    char    *src =
                        (char *)did_icp->did_ic_devid[li];

                    for (k = 0; k < devid_sz; k++)
                        dst[k] = src[k];

                    clp->l_devid_flags |= MDDB_DEVID_VALID;
                    (void) strcpy(clp->l_minor_name,
                        infop->info_minor_name);
                }
            } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
                /* Caller only wants to know the size */
                clp->l_devid_flags = MDDB_DEVID_SZ;
                clp->l_devid_sz = devid_sz;
            }
        }
    }

    /*
     * Even if a devid exists, use the dev, drvnm and mnum in the locators
     * and sidelocators.  During startup, the dev, drvnm and mnum in
     * these structures may not match the devid (the locators and
     * sidelocators will be updated to match the devid by the routine
     * load_old_replicas).  Using out-of-sync values won't cause any
     * problems since ridev will re-derive these from the devid and mnum.
     * After startup, the dev, drvnm and mnum in these structures have
     * been updated and can be used.
     */

    clp->l_blkno = locp->l_blkno;
    clp->l_flags = locp->l_flags;
    clp->l_dev = locp->l_dev;

    if (is_mn) {
        drvp = &lbp->lb_drvnm[mnsidep->mnl_drvnm_index];
        clp->l_mnum = mnsidep->mnl_mnum;
    } else {
        drvp = &lbp->lb_drvnm[sidep->l_drvnm_index];
        clp->l_mnum = sidep->l_mnum;
    }
    (void) strncpy(clp->l_driver, drvp->dn_data, MD_MAXDRVNM);
}

/*
 * Find the index into the mnsidelocator array where the entry for
 * (li, sideno) will go.  The index can then be fed into both
 * splitname2locatorblocks and cfgloc2locator so that those entries can
 * be kept in sync.
 *
 * For a MN diskset an entry already belonging to sideno is preferred;
 * failing that, the first entry with a sideno of 0 is used.
 *
 * Returns:
 *  0 for a traditional (non-MN) diskset
 *  -1 for a MN diskset with no matching and no unused slot
 *  index, if successful  (0 <= index < MD_MNMAXSIDES)
 */
static int
checklocator(
    mddb_lb_t       *lbp,
    int         li,
    side_t          sideno
)
{
    mddb_mnlb_t     *mnlbp;
    mddb_mnsidelocator_t    *mnslp;
    int         slot;
    int         first_free = -1;

    if (!(lbp->lb_flags & MDDB_MNSET))
        return (0);

    mnlbp = (mddb_mnlb_t *)lbp;
    for (slot = 0; slot < MD_MNMAXSIDES; slot++) {
        mnslp = &mnlbp->lb_mnsidelocators[slot][li];

        /* Found a match - stop looking */
        if (mnslp->mnl_sideno == sideno)
            return (slot);

        /* Remember the first empty slot, but keep looking */
        if (mnslp->mnl_sideno == 0 && first_free == -1)
            first_free = slot;
    }

    /* Still -1 if there was neither a match nor an empty slot */
    return (first_free);
}

/*
 * Record the locator information carried in 'clp' (driver name, minor
 * number, side number and, for device ID style replicas, the device ID)
 * in the locator block for locator 'li'.
 *
 * For a traditional diskset, sideno is itself the index into the
 * sidelocator array of the locator block.
 * For a MN diskset, sideno is the nodeid, which can be any number, so
 * the caller passes 'index' as the index into the mnsidelocator array
 * of the locator block.
 *
 * Returns:
 *  0 on success
 *  1 if no driver name slot is available or mddb_devid_add() fails
 */
static int
cfgloc2locator(
    mddb_lb_t       *lbp,
    mddb_cfg_loc_t      *clp,
    int         li,
    side_t          sideno,
    int         index   /* Only useful in MNsets when > 1 */
)
{
    const int       is_mnset = ((lbp->lb_flags & MDDB_MNSET) != 0);
    mddb_sidelocator_t  *side_loc = NULL;
    mddb_mnsidelocator_t    *mnside_loc = NULL;
    mddb_set_t      *setp;
    ddi_devid_t     devid;
    uchar_t         dn;

    /*
     * Pass 1: look for an in-use driver name slot that already holds
     * this driver name.
     */
    for (dn = 0; dn < MDDB_DRVNMCNT; dn++) {
        if ((lbp->lb_drvnm[dn].dn_len != 0) &&
            (strncmp(lbp->lb_drvnm[dn].dn_data, clp->l_driver,
            MD_MAXDRVNM) == 0))
            break;
    }

    /*
     * Pass 2: the name is not recorded yet, so take over the first
     * unused slot; give up if every slot is taken.
     */
    if (dn == MDDB_DRVNMCNT) {
        dn = 0;
        while ((dn < MDDB_DRVNMCNT) && (lbp->lb_drvnm[dn].dn_len != 0))
            dn++;
        if (dn == MDDB_DRVNMCNT)
            return (1);

        (void) strncpy(lbp->lb_drvnm[dn].dn_data, clp->l_driver,
            MD_MAXDRVNM);
        lbp->lb_drvnm[dn].dn_len = (uchar_t)strlen(clp->l_driver);
    }

    /*
     * Fill in the side locator.  In a MN set 'index' is the slot that
     * has the given sideno, or the first empty slot if there was no
     * match; the caller obtained it from checklocator.
     */
    if (is_mnset) {
        mnside_loc = &((mddb_mnlb_t *)lbp)->lb_mnsidelocators[index][li];
        mnside_loc->mnl_drvnm_index = dn;
        mnside_loc->mnl_mnum = clp->l_mnum;
        mnside_loc->mnl_sideno = sideno;
    } else {
        side_loc = &lbp->lb_sidelocators[sideno][li];
        side_loc->l_drvnm_index = dn;
        side_loc->l_mnum = clp->l_mnum;
    }

    /* Only device ID style replicas carry a device ID */
    if (!(lbp->lb_flags & MDDB_DEVID_STYLE))
        return (0);

    /*
     * The device ID could already be associated with this locator if
     * this is not the first side added to the set.  A device ID of 0
     * means there is no device ID for this device.
     */
    devid = (ddi_devid_t)(uintptr_t)clp->l_devid;
    if (devid == 0)
        return (0);

    setp = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
    return (mddb_devid_add(setp, li, devid, clp->l_minor_name) ? 1 : 0);
}

/*
 * See if there are mediator hosts and try to use their data to decide
 * whether the set may be treated as up to date.
 *
 * A mediator record is discarded when it is for another set, when its
 * id does not match the locator block's creation time, or when its
 * commit count is not both the highest among the remaining records and
 * equal to the locator block's commit count.
 *
 * Returns:
 *  0 if at least one record survived and either more than half of the
 *      configured mediators were contacted or a surviving record is
 *      marked golden
 *  1 otherwise (stale).  In that case MD_SET_ACCOK is set on the set
 *      when records survived, the mediator count is even and exactly
 *      half of the mediators were contacted.
 */
static int
mediate(
    mddb_set_t  *s
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    set_t       setno = s->s_setno;
    med_data_lst_t  *head;
    med_data_lst_t  *cur;
    med_data_lst_t  *next;
    med_data_t  *mdp;
    uint_t      top_cc = 0;
    int     contacted = 0;
    int     usable = 0;
    int     golden = 0;
    int     err = 1;

    /* Do not have a mediator, then the state is stale */
    if (s->s_med.n_cnt == 0)
        return (err);

    /* Contact the mediator hosts; no data at all also means stale */
    head = get_med_host_data(&s->s_med, s->s_setname, setno);
    if (head == NULL)
        return (err);

    /*
     * First pass: count every mediator that answered, mark records
     * that are not for this incarnation of the set as errored and
     * pick up the highest commit count among the records still good.
     */
    for (cur = head; cur != NULL; cur = cur->mdl_nx) {
        struct timeval32 created;

        mdp = cur->mdl_med;
        contacted++;

        /* Paranoid check */
        if (mdp->med_dat_sn != setno)
            mdp->med_dat_fl |= MED_DFL_ERROR;

        TIMEVAL_TO_TIMEVAL32(&created, &mdp->med_dat_id);

        /*CSTYLED*/
        if (timercmp(&created, &lbp->lb_ident.createtime, !=))
            mdp->med_dat_fl |= MED_DFL_ERROR;

        if (!(mdp->med_dat_fl & MED_DFL_ERROR) &&
            (mdp->med_dat_cc > top_cc))
            top_cc = mdp->med_dat_cc;
    }

    /*
     * Second pass: a record only survives if its commit count is the
     * highest seen and also matches the locator block's commit count.
     * Count the survivors and how many of them are "golden".
     */
    for (cur = head; cur != NULL; cur = cur->mdl_nx) {
        mdp = cur->mdl_med;
        if (mdp->med_dat_fl & MED_DFL_ERROR)
            continue;

        if ((mdp->med_dat_cc != top_cc) ||
            (mdp->med_dat_cc != lbp->lb_commitcnt)) {
            mdp->med_dat_fl |= MED_DFL_ERROR;
            continue;
        }

        if (mdp->med_dat_fl & MED_DFL_GOLDEN)
            golden++;
        usable++;
    }

    /* With no survivors the state stays stale */
    if (usable != 0) {
        if ((contacted >= ((s->s_med.n_cnt / 2) + 1)) || golden) {
            /* We either have a quorum or a golden copy, or both */
            err = 0;
        } else if (!(s->s_med.n_cnt & 1) &&
            (contacted == (s->s_med.n_cnt / 2))) {
            /*
             * No quorum and no golden copy, but an even number
             * of mediators with exactly 50% contacted: still
             * stale, but allow an accept.
             */
            md_set_setstatus(setno, MD_SET_ACCOK);
        }
    }

    /* Release the mediator data list */
    for (cur = head; cur != NULL; cur = next) {
        next = cur->mdl_nx;
        kmem_free(cur->mdl_med, sizeof (med_data_t));
        kmem_free(cur, sizeof (med_data_lst_t));
    }

    return (err);
}

/*
 *  1. read masterblks and locator blocks for all known database locations
 *      a. keep track of which have good master blks
 *      b. keep track of which have good locators
 *
 * Every replica on s->s_rip is visited.  For a replica whose master
 * blocks, locator block and (for device ID style replicas) device ID
 * block and device IDs all verify, and whose locator block describes
 * the replica itself, the locator block is saved in ri_lbp and the
 * incore device ID data in ri_did_icp.  Replicas that fail any check
 * are left with ri_lbp == NULL.
 *
 * Parameters:
 *  s - pointer to mddb_set structure
 *  write_lb - set to 1 if device ID style was turned off in a locator
 *      block because md_devid_destroy is set; untouched otherwise
 *
 * Returns:
 *  0, or MDDB_E_TAGDATA if more than one unique data tag was found
 *  and MD_SET_USETAG is not set for the set.
 */
static int
get_mbs_n_lbs(
    mddb_set_t  *s,
    int     *write_lb
)
{
    mddb_lb_t   *lbp = NULL;        /* pointer to locator block */
                        /* May be cast to mddb_mnlb_t */
                        /* if accessing sidenames in */
                        /* MN set */
    mddb_did_ic_t   *did_icp = NULL;    /* ptr to Device ID incore */
    mddb_did_blk_t  *did_blkp = NULL;   /* same as did_icp->did_ic_blkp */
    int     did_blkp_sz = 0;    /* bytes allocated for did_blkp */
    mddb_did_db_t   *did_dbp;
    mddb_did_info_t *did_info;
    caddr_t     did_block;
    mddb_ri_t   *rip;
    mddb_dtag_lst_t *dtlp;
    mddb_locator_t  *lp;
    daddr_t     physblk;
    int     li;
    uint_t      blk;
    md_dev64_t  dev;
    caddr_t     buffer;
    uint_t      lb_blkcnt;
    int     retval = 0;
    int     err = 0;
    int     lb_ok = 0;
    int     lb_total = 0;
    int     lb_tagged = 0;
    int     lb_tags;
    set_t       setno = s->s_setno;
    int     cont_flag, i;
    mddb_did_db_t   *did_dbp1, *did_dbp2;
    int     mn_set = 0;
    mddb_cfg_loc_t  *cl;

    /*
     * read in master blocks and locator block for all known locators.
     * lb_blkcnt will be set correctly for MN set later once getmasters
     * has determined that the set is a MN set.
     */
    lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);

    for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
        /* Drop every flag except PTCHED, IOCTL and EMASTER */
        rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
            MDDB_F_EMASTER);
        rip->ri_lbp = (mddb_lb_t *)NULL;
        rip->ri_did_icp = (mddb_did_ic_t *)NULL;

        /*
         * Translated dev is only used in calls to getmasters and
         * getblks which expect a translated (aka miniroot) dev.
         */
        dev = md_xlate_targ_2_mini(rip->ri_dev);
        if (dev == NODEV64) {
            /* Set error flag that getmasters would have set */
            /* if getmasters had been allowed to fail */
            rip->ri_flags |= MDDB_F_EMASTER;
        }

        /*
         * Invalid device id on system (due to failed or
         * removed device) or invalid devt during upgrade
         * (due to powered off device) will cause this
         * replica to be marked in error and not used.
         */
        if (rip->ri_flags & MDDB_F_EMASTER)
            continue;

        /* get all master blocks, does mddb_devopen() */
        rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
            &rip->ri_flags, &mn_set);

        /* if invalid master block - try next replica */
        if (! rip->ri_mbip)
            continue;

        /*
         * If lbp alloc'd to wrong size - reset it.
         * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
         * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
         */
        if (lbp) {
            if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
                ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
                kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
                lbp = (mddb_lb_t *)NULL;
            }
        }

        /*
         * lbp is only handed to a replica once it passes every check
         * below; a buffer left over from a rejected replica is reused.
         */
        if (lbp == (mddb_lb_t *)NULL) {
            /* If a MN set, set lb_blkcnt for MN loc blk size */
            if (mn_set)
                lb_blkcnt = MDDB_MNLBCNT;
            lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
                KM_SLEEP);
        }

        /*
         * Read in all the sectors for the locator block
         * NOTE: Need to use getblks, rather than readblklst.
         *  because it is too early and things are
         *  NOT set up yet for read*()'s
         */
        buffer = (caddr_t)lbp;
        for (blk = 0; blk < lb_blkcnt; blk++) {
            physblk = getphysblk(blk, rip->ri_mbip);
            err = getblks(s, buffer, dev, physblk,
                btodb(MDDB_BSIZE), 0);
            if (err) {
                rip->ri_flags |= err;
                break;
            }
            buffer += MDDB_BSIZE;
        }

        if (err)
            continue;

        /* Verify the locator block */
        if (blk != lb_blkcnt)
            continue;
        if (lbp->lb_magic != MDDB_MAGIC_LB)
            continue;
        if (lbp->lb_blkcnt != lb_blkcnt)
            continue;
        if (mn_set) {
            /* If a MN set, check for MNLB revision in lb. */
            if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
                continue;
        } else {
            /* If not a MN set, check for LB revision in lb. */
            if (revchk(MDDB_REV_LB, lbp->lb_revision))
                continue;
        }
        if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
            continue;

        /*
         * With the addition of MultiNode Disksets, we must make sure
         * to verify that this is the correct set.  A node could
         * have been out of the config for awhile and this disk could
         * have been moved to a different diskset and we don't want
         * to accidentally start the wrong set.
         *
         * We don't do this check if we're in the middle of
         * importing a set.
         */
        if (!(md_get_setstatus(s->s_setno) &
            (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
            (lbp->lb_setno != s->s_setno))
            continue;

        rip->ri_flags |= MDDB_F_LOCACC;

        /*
         * a commit count of zero means this locator has been deleted
         */
        if (lbp->lb_commitcnt == 0)
            continue;

        /*
         * If replica is in the device ID style and md_devid_destroy
         * flag is set, turn off device id style.  This is only to be
         * used in a catastrophic failure case.  Examples would be
         * where the device id of all drives in the system
         * (especially the mirror'd root drives) had been changed
         * by firmware upgrade or by a patch to an existing disk
         * driver.  Another example would be in the case of non-unique
         * device ids due to a bug.  The device id would be valid on
         * the system, but would return the wrong dev_t.
         */
        if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
            lbp->lb_flags &= ~MDDB_DEVID_STYLE;
            lbp->lb_didfirstblk = 0;
            lbp->lb_didblkcnt = 0;
            *write_lb = 1;
        }


        /*
         * If replica is in device ID style, read in device ID
         * block and verify device ID block information.
         */
        if (lbp->lb_flags & MDDB_DEVID_STYLE) {

            /*
             * Validate the on-disk device ID block count before
             * it is used to size the allocation and to drive the
             * read loop below.  Any other value is rejected
             * anyway, and a count of zero would leave nothing
             * valid to examine once the "read" had completed.
             */
            if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
                continue;

            /* Read in device ID block */
            if (did_icp == NULL) {
                did_icp = (mddb_did_ic_t *)
                    kmem_zalloc(sizeof (mddb_did_ic_t),
                    KM_SLEEP);
            } else {
                /* Reuse did_icp, but clear out data */
                if (did_icp->did_ic_blkp !=
                    (mddb_did_blk_t *)NULL) {
                    kmem_free((caddr_t)did_icp->did_ic_blkp,
                        did_blkp_sz);
                    did_blkp = (mddb_did_blk_t *)NULL;
                    did_icp->did_ic_blkp =
                        (mddb_did_blk_t *)NULL;
                }
                if (did_icp->did_ic_dbp !=
                    (mddb_did_db_t *)NULL) {
                    did_dbp1 = did_icp->did_ic_dbp;
                    while (did_dbp1) {
                        did_dbp2 = did_dbp1->db_next;
                        kmem_free((caddr_t)
                            did_dbp1->db_ptr,
                            dbtob(did_dbp1->db_blkcnt));
                        kmem_free((caddr_t)did_dbp1,
                            sizeof (mddb_did_db_t));
                        did_dbp1 = did_dbp2;
                    }
                    did_icp->did_ic_dbp =
                        (mddb_did_db_t *)NULL;
                }
                for (i = 0; i < MDDB_NLB; i++) {
                    did_icp->did_ic_devid[i] =
                        (ddi_devid_t)NULL;
                }
            }

            /* Can't reuse blkp since size could be different */
            if (did_blkp != (mddb_did_blk_t *)NULL) {
                kmem_free(did_blkp, did_blkp_sz);
            }
            did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
            did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
                KM_SLEEP);
            did_icp->did_ic_blkp = did_blkp;
            buffer = (caddr_t)did_blkp;
            for (blk = lbp->lb_didfirstblk;
                blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
                blk++) {
                physblk = getphysblk(blk, rip->ri_mbip);
                err = getblks(s, buffer, dev, physblk,
                    btodb(MDDB_BSIZE), 0);
                if (err) {
                    rip->ri_flags |= err;
                    break;
                }
                buffer += MDDB_BSIZE;
            }
            if (err)
                continue;

            /* Verify the Device ID block */
            if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
                continue;
            if (did_blkp->blk_magic != MDDB_MAGIC_DI)
                continue;
            if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
                continue;
            if (crcchk(did_blkp, &did_blkp->blk_checksum,
                dbtob(lbp->lb_didblkcnt), NULL))
                continue;

            /*
             * Check if device ID block is out of sync with the
             * Locator Block by checking if the locator block
             * commitcnt does not match the device id block
             * commitcnt.  If an 'out of sync' condition
             * exists, discard this replica since it has
             * inconsistent data and can't be used in
             * determining the best replica.
             *
             * An 'out of sync' condition could happen if old
             * SDS code was running with new devid style replicas
             * or if a failure occurred between the writing of
             * the locator block's commitcnt and the device
             * id block's commitcnt.
             *
             * If old SDS code had been running, the upgrade
             * process should detect this situation and
             * have removed all of the device id information
             * via the md_devid_destroy flag in md.conf.
             */
            if (did_blkp->blk_commitcnt !=
                lbp->lb_commitcnt) {
                continue;
            }
        }


        /*
         * If replica is still in device ID style, read in all
         * of the device IDs, verify the checksum of the device IDs.
         */
        if (lbp->lb_flags & MDDB_DEVID_STYLE) {
            /*
             * Reset valid bit in device id info block flags. This
             * flag is stored on disk, but the valid bit is reset
             * when reading in the replica.  If the corresponding
             * device id is valid (aka meaning that the system
             * knows about this device id), the valid bit will
             * be set at a later time.  The valid bit for this
             * replica's device ID will be set in this routine.
             * The valid bits for the rest of the device id's
             * will be set after the 'best' replica has
             * been selected in routine load_old_replicas.
             * Reset updated bit in device id info block flags.
             * This flag is also stored on disk, reset when read
             * in and set when the locators and side locators
             * have been updated to match this valid device
             * id information.
             */
            for (li = 0; li < lbp->lb_loccnt; li++) {
                did_info = &did_blkp->blk_info[li];
                if (did_info->info_flags & MDDB_DID_EXISTS)
                    did_info->info_flags &=
                        ~(MDDB_DID_VALID |
                        MDDB_DID_UPDATED);
            }

            /*
             * cont_flag is set when a device ID cannot be read or
             * fails its checksum, so that the replica can be
             * skipped once the inner loop has been left.
             */
            cont_flag = 0;
            for (li = 0; li < lbp->lb_loccnt; li++) {
                did_info = &did_blkp->blk_info[li];
                did_block = (caddr_t)NULL;
                if (did_info->info_flags & MDDB_DID_EXISTS) {
                    /*
                     * Check if block has
                     * already been read in
                     */
                    did_dbp = did_icp->did_ic_dbp;
                    while (did_dbp != 0) {
                        if (did_dbp->db_firstblk ==
                            did_info->info_firstblk)
                            break;
                        else
                            did_dbp =
                                did_dbp->db_next;
                    }
                    /* if block not found, read it in */
                    if (did_dbp == NULL) {
                        did_block = (caddr_t)
                            (kmem_zalloc(dbtob(
                            did_info->info_blkcnt),
                            KM_SLEEP));
                        buffer = (caddr_t)did_block;
                        for (blk =
                            did_info->info_firstblk;
                            blk < (did_info->
                            info_firstblk +
                            did_info->info_blkcnt);
                            blk++) {
                            physblk =
                                getphysblk(blk,
                                rip->ri_mbip);
                            err = getblks(s,
                                buffer, dev,
                                physblk, btodb(
                                MDDB_BSIZE), 0);
                            if (err) {
                                rip->ri_flags |=
                                    err;
                                break;
                            }
                            buffer += MDDB_BSIZE;
                        }
                        if (err) {
                            kmem_free(did_block,
                                dbtob(did_info->
                                info_blkcnt));
                            did_block =
                                (caddr_t)NULL;
                            cont_flag = 1;
                            break;
                        }

                        /*
                         * Block read in -
                         * alloc Disk Block area
                         */
                        did_dbp = (mddb_did_db_t *)
                            kmem_zalloc(
                            sizeof (mddb_did_db_t),
                            KM_SLEEP);
                        did_dbp->db_ptr = did_block;
                        did_dbp->db_firstblk =
                            did_info->info_firstblk;
                        did_dbp->db_blkcnt =
                            did_info->info_blkcnt;

                        /* Add to front of dbp list */
                        did_dbp->db_next =
                            did_icp->did_ic_dbp;
                        did_icp->did_ic_dbp = did_dbp;
                    }
                    /* Check validity of devid in block */
                    if (crcchk(((char *)did_dbp->db_ptr +
                        did_info->info_offset),
                        &did_info->info_checksum,
                        did_info->info_length, NULL)) {
                        cont_flag = 1;
                        break;
                    }

                    /* Block now pointed to by did_dbp */
                    did_icp->did_ic_devid[li] =
                        (ddi_devid_t)((char *)
                        did_dbp->db_ptr +
                        did_info->info_offset);
                }
            }
            if (cont_flag)
                continue;
        }

        /*
         * All blocks containing devids are now in core.
         */

        /*
         * If we're doing a replicated import (also known as
         * remote copy import), the device id in the locator
         * block is incorrect and we need to fix it up here
         * along with the l_dev otherwise we run into lots of
         * trouble later on.
         *
         * The fixup works from the device ID block and the incore
         * device IDs, which belong to this locator block only when
         * it is in device ID style.  Otherwise did_blkp and did_icp
         * are NULL or left over from a replica rejected earlier, so
         * the fixup is skipped.
         */
        if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT) &&
            (lbp->lb_flags & MDDB_DEVID_STYLE)) {
            mddb_ri_t   *trip;
            for (li = 0; li < lbp->lb_loccnt; li++) {
                did_info = &did_blkp->blk_info[li];
                lp = &lbp->lb_locators[li];

                if (lp->l_flags & MDDB_F_DELETED)
                    continue;

                if (!(did_info->info_flags & MDDB_DID_EXISTS))
                    continue;

                if (did_icp->did_ic_devid[li] == NULL)
                    continue;

                for (trip = s->s_rip; trip != NULL;
                    trip = trip->ri_next) {
                    if (trip->ri_old_devid == NULL)
                        continue;
                    if (ddi_devid_compare(
                        trip->ri_old_devid,
                        did_icp->did_ic_devid[li]) != 0) {
                        continue;
                    }

                    /* update l_dev and side mnum */
                    lp->l_dev = md_cmpldev(trip->ri_dev);
                    lbp->lb_sidelocators[0][li].l_mnum =
                        md_getminor(trip->ri_dev);
                }
            }
        }

        /*
         * If there is a valid devid, verify that this locator
         * block has information about itself by checking the
         * device ID, minor_name and block
         * number from this replica's incore data structure
         * against the locator block information that has just
         * been read in from disk.
         *
         * If not a valid devid, verify that this locator block
         * has information about itself by checking the minor
         * number, block number and driver name from this
         * replica's incore data structure against the locator
         * block information that has just been read in from disk.
         *
         * In every variant the search loop is left early (with
         * li < lbp->lb_loccnt) only when a matching locator is found.
         */
        if ((rip->ri_devid != NULL) &&
            (lbp->lb_flags & MDDB_DEVID_STYLE)) {
            /*
             * This locator block MUST have locator (replica)
             * information about itself.  Check against devid,
             * slice part of minor number, and block number.
             */
            for (li = 0; li < lbp->lb_loccnt; li++) {
                did_info = &did_blkp->blk_info[li];
                lp = &lbp->lb_locators[li];
                if (lp->l_flags & MDDB_F_DELETED)
                    continue;

                if (!(did_info->info_flags & MDDB_DID_EXISTS))
                    continue;

                if (((md_get_setstatus(setno) &
                    MD_SET_REPLICATED_IMPORT)) &&
                    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
                    if (ddi_devid_compare(rip->ri_old_devid,
                        did_icp->did_ic_devid[li]) != 0)
                        continue;
                } else {
                    if (ddi_devid_compare(rip->ri_devid,
                        did_icp->did_ic_devid[li]) != 0)
                        continue;
                }

                if (strcmp(rip->ri_minor_name,
                    did_info->info_minor_name) != 0)
                    continue;

                if (lp->l_blkno == rip->ri_blkno)
                    break;
            }
        } else {
            /*
             * This locator block MUST have locator (replica)
             * information about itself.
             */
            if (!mn_set) {
                for (li = 0; li < lbp->lb_loccnt; li++) {
                    mddb_drvnm_t        *dn;
                    mddb_sidelocator_t  *slp;

                    lp = &lbp->lb_locators[li];
                    slp = &lbp->
                        lb_sidelocators[s->s_sideno][li];
                    if (lp->l_flags & MDDB_F_DELETED)
                        continue;
                    if (slp->l_mnum != md_getminor(
                        rip->ri_dev))
                        continue;
                    if (lp->l_blkno != rip->ri_blkno)
                        continue;
                    dn = &lbp->lb_drvnm[slp->l_drvnm_index];
                    if (strncmp(dn->dn_data,
                        rip->ri_driver, MD_MAXDRVNM) == 0)
                        break;
                }
            } else {
                for (li = 0; li < lbp->lb_loccnt; li++) {
                    mddb_drvnm_t        *dn;
                    mddb_mnsidelocator_t    *mnslp;
                    mddb_mnlb_t     *mnlbp;
                    int         side;

                    /*
                     * Check all possible locators looking
                     * for match to the currently read-in
                     * locator, must match on:
                     *  - blkno
                     *  - side locator for this
                     *    node's side
                     *  - side locator minor number
                     *  - side locator driver name
                     */

                    /*
                     * Looking at sidelocs:
                     * cast lbp -> mnlbp
                     */
                    mnlbp = (mddb_mnlb_t *)lbp;
                    lp = &mnlbp->lb_locators[li];
                    if (lp->l_flags & MDDB_F_DELETED)
                        continue;
                    if (lp->l_blkno != rip->ri_blkno)
                        continue;

                    for (side = 0; side < MD_MNMAXSIDES;
                        side++) {
                        mnslp = &mnlbp->
                            lb_mnsidelocators[side][li];
                        if (mnslp->mnl_sideno ==
                            s->s_sideno) {
                            break;
                        }
                    }
                    /* No matching side found */
                    if (side == MD_MNMAXSIDES)
                        continue;
                    if (mnslp->mnl_mnum !=
                        md_getminor(rip->ri_dev))
                        continue;
                    dn = &lbp->
                        lb_drvnm[mnslp->mnl_drvnm_index];
                    if (strncmp(dn->dn_data,
                        rip->ri_driver, MD_MAXDRVNM) == 0)
                        break;
                }
            }
        }

        /*
         * Didn't find ourself in this locator block it means
         * the locator block is a stale transplant. Probably from
         * a user doing a dd.
         */
        if (li == lbp->lb_loccnt)
            continue;

        /*
         * Keep track of the number of accessed and valid
         * locator blocks.
         */
        lb_ok++;

        /*
         * Read the tag in, skips invalid or blank tags.
         * Only valid tags allocate storage
         * Data tags are not used in MN disksets.
         */
        if ((!mn_set) && (! dt_read(s, lbp, rip))) {
            /*
             * Keep track of the number of tagged
             * locator blocks.
             */
            lb_tagged++;

            /* Keep a list of unique tags. */
            (void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
        }

        if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
            /*
             * go through locator block and add any other
             * locations of the data base.
             * For the replicated import case, this was done earlier
             * and we really don't need or want to do so again
             */
            cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
            for (li = 0; li < lbp->lb_loccnt; li++) {
                lp = &lbp->lb_locators[li];
                if (lp->l_flags & MDDB_F_DELETED)
                    continue;

                /*
                 * locator2cfgloc is called twice: first with
                 * MDDB_DEVID_GETSZ, after which MDDB_DEVID_SZ
                 * and l_devid_sz are checked, then again with
                 * a buffer of that size (MDDB_DEVID_SPACE).
                 */
                cl->l_devid_flags = MDDB_DEVID_GETSZ;
                cl->l_devid = (uint64_t)0;
                cl->l_devid_sz = 0;
                cl->l_old_devid = (uint64_t)0;
                cl->l_old_devid_sz = 0;
                cl->l_minor_name[0] = '\0';
                locator2cfgloc(lbp, cl, li, s->s_sideno,
                    did_icp);

                if (cl->l_devid_flags & MDDB_DEVID_SZ) {
                    if ((cl->l_devid = (uintptr_t)kmem_alloc
                        (cl->l_devid_sz, KM_SLEEP))
                        == NULL) {
                        continue;
                    } else {
                        cl->l_devid_flags =
                            MDDB_DEVID_SPACE;
                    }
                }
                locator2cfgloc(lbp, cl, li, s->s_sideno,
                    did_icp);

                (void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

                if (cl->l_devid_flags & MDDB_DEVID_SPACE)
                    kmem_free((caddr_t)(uintptr_t)
                        cl->l_devid, cl->l_devid_sz);
            }
            kmem_free(cl, sizeof (mddb_cfg_loc_t));
        }

        /*
         * Save LB for later.  The buffers now belong to this replica,
         * so forget them here; the next replica gets fresh ones.
         */
        rip->ri_lbp = lbp;
        if (lbp->lb_flags & MDDB_DEVID_STYLE) {
            rip->ri_did_icp = did_icp;
            did_icp = (mddb_did_ic_t *)NULL;
            did_blkp = (mddb_did_blk_t *)NULL;
        } else
            rip->ri_did_icp = NULL;
        lbp = (mddb_lb_t *)NULL;
    }

    /*
     * Free whatever the last rejected replica left behind and that
     * was therefore never handed over to a replica.
     */
    if (lbp != (mddb_lb_t *)NULL)
        kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

    if (did_icp != (mddb_did_ic_t *)NULL) {
        if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
            kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
            did_blkp = (mddb_did_blk_t *)NULL;
        }
        if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
            did_dbp1 = did_icp->did_ic_dbp;
            while (did_dbp1) {
                did_dbp2 = did_dbp1->db_next;
                kmem_free((caddr_t)did_dbp1->db_ptr,
                    dbtob(did_dbp1->db_blkcnt));
                kmem_free((caddr_t)did_dbp1,
                    sizeof (mddb_did_db_t));
                did_dbp1 = did_dbp2;
            }
        }
        kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
    }

    if (did_blkp != (mddb_did_blk_t *)NULL) {
        kmem_free((caddr_t)did_blkp, did_blkp_sz);
    }

    /* No locator blocks were ok */
    if (lb_ok == 0)
        goto out;

    /* No tagged data was found - will be 0 for MN diskset */
    if (lb_tagged == 0)
        goto out;

    /*
     * Find the highest non-deleted replica count.
     * NOTE: lb_total is not read again in this routine.
     */
    for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
        int     lb_tot = 0;

        if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
            continue;

        if (rip->ri_lbp == (mddb_lb_t *)NULL)
            continue;

        for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
            lp = &rip->ri_lbp->lb_locators[li];
            if (lp->l_flags & MDDB_F_DELETED)
                continue;
            lb_tot++;
        }

        if (lb_tot > lb_total)
            lb_total = lb_tot;
    }

    /* Count the number of unique tags */
    for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
        lb_tags++;

    /* Should have at least one tag at this point */
    ASSERT(lb_tags > 0);


    /*
     * If the number of tagged locators is not the same as the number of
     * OK locators OR more than one tag exists, then make sure the
     * selected tag will be written out later.
     */
    if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
        md_set_setstatus(setno, MD_SET_TAGDATA);

    /* Only a single tag, take the tagged data */
    if (lb_tags == 1) {
        dt_setup(s, &s->s_dtlp->dtl_dt);
        md_set_setstatus(setno, MD_SET_USETAG);
        goto out;
    }

    /* Multiple tags, not selecting a tag, tag mode is on */
    if (! (md_get_setstatus(setno) & MD_SET_USETAG))
        retval = MDDB_E_TAGDATA;

out:

    return (retval);
}

/*
 * Load an existing replica database for set 's'.
 *
 *  1. Select a locator.
 *  2. check if enough locators now have current copies
 *  3. read in database from one of latest
 *  4. if known to have latest make all database the same
 *  5. if configuration has changed rewrite locators
 *
 * Parameters:
 *  s - pointer to mddb_set structure
 *  flag - used in MN disksets to tell if this node is being joined to
 *      a diskset that is in the STALE state.  If the flag is
 *      MDDB_MN_STALE, then this node should be marked in the STALE
 *      state even if > 50% mddbs are available.  (The diskset can
 *      only change from STALE->OK if all nodes withdraw from the
 *      MN diskset and then rejoin).
 *
 * Returns:
 *  0, or one of the MDDB_E_* codes assigned to retval below
 *  (the value from get_mbs_n_lbs(), MDDB_E_NOLOCBLK, MDDB_E_NODEVID,
 *  MDDB_E_NOSPACE, MDDB_E_NOLOCNMS, MDDB_E_NODIRBLK).
 *
 *  NOTE(review): the "goto errout" paths taken after a failed
 *  update_locatorblock(), a non-zero selectreplicas() result and a failed
 *  push_lb() do not assign retval, so the function returns 0 on those
 *  paths - confirm that callers rely on the set status instead.
 *
 * Both the success and the failure paths run the cleanup code at the
 * errout label, which frees the per-replica (rip) components that are no
 * longer needed.
 */
static int
load_old_replicas(
    mddb_set_t  *s,
    int     flag
)
{
    mddb_lb_t   *lbp = NULL;
    mddb_mnlb_t *mnlbp = NULL;
    mddb_ri_t   *rip;
    mddb_locator_t  *lp;
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    int     li;
    int     alc;
    int     lc;
    int     tlc;
    int     retval = 0;
    caddr_t     p;
    size_t      maxrecsize;
    set_t       setno = s->s_setno;
    mddb_did_db_t   *did_dbp1;
    mddb_did_info_t *did_info;
    mddb_did_ic_t   *did_icp = NULL;
    md_dev64_t  *newdev;
    mddb_sidelocator_t  *slp = 0;
    mddb_mnsidelocator_t    *mnslp = 0;
    uchar_t     i;
    char        *name;
    ddi_devid_t ret_devid;
    md_dev64_t  dev;
    uint_t      len, sz;
    char        *minor_name;
    int     write_lb = 0;
    int     rval;
    int     stale_rtn = 0;

    /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
    if (retval = get_mbs_n_lbs(s, &write_lb))
        goto errout;

    /* Pick the locator block that the rest of the load is based on */
    if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
        retval = MDDB_E_NOLOCBLK;
        goto errout;
    }

    /* If a multi-node set, then set md_set.s_status flag */
    if (lbp->lb_flags & MDDB_MNSET) {
        md_set_setstatus(setno, MD_SET_MNSET);
        /*
         * If data tag area had been allocated before set type was
         * known - free it now.
         */
        if (md_set[setno].s_dtp) {
            kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
            md_set[setno].s_dtp = NULL;
        }
    }

    /*
     * If the replica is in devid format, setup the devid incore ptr.
     */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        /* Find the rip entry that owns the selected locator block */
        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (rip->ri_lbp == s->s_lbp) {
                did_icp = s->s_did_icp = rip->ri_did_icp;
                break;
            }
        }
        /*
         * If no devid incore info found - something has gone
         * wrong so errout.
         */
        if (rip == NULL) {
            retval = MDDB_E_NODEVID;
            goto errout;
        }

        /*
         * Add all blocks containing devids to free list.
         * Then remove addresses that actually contain devids.
         */
        did_dbp1 = did_icp->did_ic_dbp;
        while (did_dbp1) {
            if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
                0, dbtob(did_dbp1->db_blkcnt))) {
                retval = MDDB_E_NOSPACE;
                goto errout;
            }

            did_dbp1 = did_dbp1->db_next;
        }
        for (li = 0; li < lbp->lb_loccnt; li++) {
            did_info = &(did_icp->did_ic_blkp->blk_info[li]);
            if (!(did_info->info_flags & MDDB_DID_EXISTS))
                continue;

            if (mddb_devid_free_delete(s, did_info->info_firstblk,
                did_info->info_offset, did_info->info_length)) {
                /* unable to find disk block */
                retval = MDDB_E_NODEVID;
                goto errout;
            }
        }
    }

    /*
     * Fill in s_mbiarray, count all locators (lc) and the locators
     * flagged MDDB_F_LOCACC (alc).
     */
    alc = 0;
    lc = 0;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        ddi_devid_t li_devid;

        lp = &lbp->lb_locators[li];

        if (lp->l_flags & MDDB_F_DELETED)
            continue;

        /* Count non-deleted replicas */
        lc++;

        /*
         * Use the devid of this locator to compare with the rip
         * list.  The scenario to watch out for here is that this
         * locator could be on a disk that is dead and there could
         * be a valid entry in the rip list for a different disk
         * that has been moved to the dead disks dev_t.  We don't
         * want to match with the moved disk.
         */
        li_devid = NULL;
        (void) mddb_devid_get(s, li, &li_devid, &minor_name);

        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (match_mddb(rip, li_devid, minor_name,
                md_expldev(lp->l_dev), lp->l_blkno)) {
                break;
            }
        }
        if (rip == NULL) {
            /*
             * If rip not found, then mark error in master block
             * so that no writes are later attempted to this
             * replica.  rip may not be setup if ridev
             * failed due to un-found driver name.
             */
            lp->l_flags |= MDDB_F_EMASTER;
            continue;
        }

        s->s_mbiarray[li] = rip->ri_mbip;

        /*
         * Drop every locator flag except MDDB_F_ACTIVE, then merge in
         * the flags recorded in the matching rip entry.
         */
        lp->l_flags &= MDDB_F_ACTIVE;
        lp->l_flags |= (int)rip->ri_flags;

        if (rip->ri_transplant)
            lp->l_flags &= ~MDDB_F_ACTIVE;

        if (lp->l_flags & MDDB_F_LOCACC)
            alc++;
    }

    /*
     * Save on a divide - calculate the threshold up front: half of the
     * non-deleted locators, rounded up.
     */
    tlc = ((lc + 1) / 2);

    if (alc > tlc) {        /* alc > tlc        - OK */
        md_clr_setstatus(setno, MD_SET_STALE);
    } else if (alc < tlc) {     /* alc < tlc        - stale */
        md_set_setstatus(setno, MD_SET_STALE);
    } else if (lc & 1) {        /* alc == tlc && odd    - OK */
        md_clr_setstatus(setno, MD_SET_STALE);
    } else {            /* alc == tlc && even   - ? */
        /* Can do an accept, and are */
        if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
            md_clr_setstatus(setno, MD_SET_STALE);
        } else {        /* possibly has a mediator */
            if (mediate(s)) {
                md_set_setstatus(setno, MD_SET_STALE);
            } else {
                md_clr_setstatus(setno, MD_SET_STALE);
            }
        }

        /*
         * The mirrored_root_flag allows the sysadmin to decide to
         * start the local set in a read/write (non-stale) mode
         * when there are only 50% available mddbs on the system and
         * when the root file system is on a mirror.  This is useful
         * in a 2 disk system where 1 disk failure would cause an mddb
         * quorum failure and subsequent boot failures since the root
         * filesystem would be in a read-only state.
         */
        if (mirrored_root_flag == 1 && setno == 0 &&
            svm_bootpath[0] != 0) {
            md_clr_setstatus(setno, MD_SET_STALE);
        } else {
            if (md_get_setstatus(setno) & MD_SET_STALE) {
                /* Allow half mode - CAREFUL! */
                if (mddb_allow_half)
                    md_clr_setstatus(setno, MD_SET_STALE);
            }
        }

        /*
         * In a MN diskset,
         *  - if 50% mddbs are unavailable and this
         *      has been marked STALE above
         *  - master node isn't in the STALE state
         *  - this node isn't the master node (this node
         *      isn't the first node to join the set)
         * then clear the STALE state and set TOOFEW.
         *
         * If this node is the master node and set was marked STALE,
         * then the set stays STALE.
         *
         * If this node is not the master and this node's state is
         * STALE and the master node is not marked STALE,
         * then master node must be in the TOOFEW state or the
         * master is panic'ing.  A MN diskset can only be placed into
         * the STALE state by having the first node join the set
         * with <= 50% mddbs.  There's no way for a MN diskset to
         * transition between STALE and not-STALE states unless all
         * nodes are withdrawn from the diskset or all nodes in the
         * diskset are rebooted at the same time.
         *
         * So, mark this node's state as TOOFEW instead of STALE.
         */
        if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
            == (MD_SET_MNSET | MD_SET_STALE)) &&
            ((flag & MDDB_MN_STALE) == 0) &&
            (!(md_set[setno].s_am_i_master))) {
            md_clr_setstatus(setno, MD_SET_STALE);
            md_set_setstatus(setno, MD_SET_TOOFEW);
        }
    }

    /*
     * If a MN set is marked STALE on the other nodes,
     * mark it stale here.  Override all other considerations
     * such as a mediator or > 50% mddbs available.
     */
    if (md_get_setstatus(setno) & MD_SET_MNSET) {
        if (flag & MDDB_MN_STALE)
            md_set_setstatus(setno, MD_SET_STALE);
    }

    /*
     * read a good copy of the locator names
     * if an error occurs reading what is supposed
     * to be a good copy continue looking for another
     * good copy
     */
    s->s_lnp = NULL;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
            (lp->l_flags & MDDB_F_EMASTER))
            continue;

        /* Find rip entry for this locator if one exists */
        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
                lp->l_blkno))
                break;
        }

        if (rip == NULL) {
            continue;
        }

        /*
         * Use the rip commitcnt since the commitcnt in lbp could
         * have been cleared by selectlocator.  Looking for a replica
         * with the same commitcnt as the 'golden' copy in order to
         * get the same data.
         */
        if (rip->ri_commitcnt != lbp->lb_commitcnt) {
            continue;
        }

        /*
         * Now have a copy of the database that is equivalent
         * to the chosen locator block with respect to
         * inittime, identifier and commitcnt.   Trying the
         * equivalent databases in the order that they were
         * written will provide the most up to date data.
         */
        lp->l_flags |= readlocnames(s, li);
        if (s->s_lnp)
            break;
    }

    if (s->s_lnp == NULL) {
        retval = MDDB_E_NOLOCNMS;
        goto errout;
    }

    /*
     * read a good copy of the data base
     * if an error occurs reading what is supposed
     * to be a good copy continue looking for another
     * good copy
     */

    s->s_dbp = NULL;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
            (lp->l_flags & MDDB_F_EMASTER))
            continue;

        /* Find rip entry for this locator if one exists */
        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
                lp->l_blkno))
                break;
        }

        if (rip == NULL) {
            continue;
        }

        /*
         * Use the rip commitcnt since the commitcnt in lbp could
         * have been cleared by selectlocator.  Looking for a replica
         * with the same commitcnt as the 'golden' copy in order to
         * get the same data.
         */
        if (rip->ri_commitcnt != lbp->lb_commitcnt) {
            continue;
        }

        /*
         * Now have a copy of the database that is equivalent
         * to the chosen locator block with respect to
         * inittime, identifier and commitcnt.   Trying the
         * equivalent databases in the order that they were
         * written will provide the most up to date data.
         */
        lp->l_flags |= readcopy(s, li);

        if (s->s_dbp)
            break;
    }

    if (s->s_dbp == NULL) {
        retval = MDDB_E_NODIRBLK;
        goto errout;
    }

    /*
     * The loop above was left via break, so lp is the locator whose
     * readcopy() produced s_dbp; flag it as the master, up to date copy.
     */
    lp->l_flags |= MDDB_F_MASTER;
    lp->l_flags |= MDDB_F_UP2DATE;

    /*
     * go through and find largest record;
     * Also fixup the user data area's
     */
    maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
        for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
            if (dep->de_flags & MDDB_F_OPT)
                getoptrecord(s, dep);
            else {
                allocuserdata(dep);
                maxrecsize = MAX(dep->de_recsize, maxrecsize);
            }

    /* Grow the set's data buffer if a larger record was found */
    if (maxrecsize > s->s_databuffer_size) {
        p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
        if (s->s_databuffer_size)
            kmem_free(s->s_databuffer, s->s_databuffer_size);
        s->s_databuffer = p;
        s->s_databuffer_size = maxrecsize;
    }

    /* If we can clear the tag data record, do it now. */
    /* Data tags not supported on MN sets */
    if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
        (!(md_get_setstatus(setno) & MD_SET_MNSET)))
        dt_setup(s, NULL);

    /* This will return non-zero if STALE or TOOFEW */
    /* This will write out chosen replica image to all replicas */
    stale_rtn = selectreplicas(s, MDDB_SCANALL);

    if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
        ddi_devid_t devidptr;

        /*
         * ignore the return value from selectreplicas because we
         * may have a STALE or TOOFEW set in the case of a partial
         * replicated diskset. We will fix that up later.
         */

        /*
         * NOTE(review): did_icp is only set above when the locator
         * block is MDDB_DEVID_STYLE; this path presumably implies
         * devid style - confirm.
         */
        lbp = s->s_lbp;
        for (li = 0; li < lbp->lb_loccnt; li++) {
            did_info = &(did_icp->did_ic_blkp->blk_info[li]);

            if (did_info->info_flags & MDDB_DID_EXISTS) {
                devidptr = s->s_did_icp->did_ic_devid[li];
                lp = &lbp->lb_locators[li];
                /*
                 * For every rip whose old devid equals this
                 * locator's devid, update the locator block
                 * with the rip's current devid.
                 */
                for (rip = s->s_rip; rip != NULL;
                    rip = rip->ri_next) {
                    if (rip->ri_old_devid == 0)
                        continue;
                    if (ddi_devid_compare(rip->ri_old_devid,
                        devidptr) != 0) {
                        continue;
                    }
                    if (update_locatorblock(s,
                        md_expldev(lp->l_dev),
                        rip->ri_devid, rip->ri_old_devid)) {
                        goto errout;
                    }
                }
            }
        }
    } else {
        if (stale_rtn)
            goto errout;
    }

    /*
     * If the replica is in device id style - validate the device id's,
     * if present, in the locator block devid area.
     *
     * newdev[li] holds the dev_t handed to mddb_devid_validate() for
     * locator li; it is consulted below when updating side locators.
     */
    newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        for (li = 0; li < lbp->lb_loccnt; li++) {
            newdev[li] = 0;
            lp = &lbp->lb_locators[li];
            if (lp->l_flags & MDDB_F_DELETED)
                continue;
            did_info = &(did_icp->did_ic_blkp->blk_info[li]);
            dev = md_expldev(lp->l_dev);
            if (did_info->info_flags & MDDB_DID_EXISTS) {
                /* Validate device id on current system */
                newdev[li] = dev;
                if (mddb_devid_validate(
                    did_icp->did_ic_devid[li],
                    &(newdev[li]),
                    did_info->info_minor_name) == 0) {
                    /* Set valid flag */
                    did_info->info_flags |= MDDB_DID_VALID;
                } else {
                    lp->l_flags |= MDDB_F_EMASTER;
                }
            } else if (!(MD_UPGRADE)) {
                /*
                 * If a device doesn't have a device id,
                 * check if there is now a device ID
                 * associated with device.  If one exists,
                 * add it to the locator block devid area.
                 * If there's not enough space to add it,
                 * print a warning.
                 * Don't do this during upgrade.
                 */
                dev_t ddi_dev = md_dev64_to_dev(dev);
                if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
                    DDI_SUCCESS) {
                    if (ddi_lyr_get_minor_name(ddi_dev,
                        S_IFBLK, &minor_name)
                        == DDI_SUCCESS) {
                        if (mddb_devid_add(s, li,
                            ret_devid, minor_name)) {
                            cmn_err(CE_WARN,
                                "Not enough space"
                                " in metadevice"
                                " state"
                                " database\n");
                            cmn_err(CE_WARN,
                                "to add relocation"
                                " information for"
                                " device:\n");
                            cmn_err(CE_WARN,
                                " major = %d, "
                                " minor = %d\n",
                                getmajor(ddi_dev),
                                getminor(ddi_dev));
                        } else {
                            write_lb = 1;
                        }
                        kmem_free(minor_name,
                            strlen(minor_name) + 1);
                    }
                    ddi_devid_free(ret_devid);
                }
            }
        }

        /*
         * If a device has a valid device id and if the dev_t
         * associated with the device id has changed, update the
         * driver name, minor num and dev_t in the local and side
         * locators to match the dev_t that the system currently
         * associates with the device id.
         *
         * Don't do this during upgrade.
         */
        if (!(MD_UPGRADE)) {
            for (li = 0; li < lbp->lb_loccnt; li++) {
                lp = &lbp->lb_locators[li];
                if (lp->l_flags & MDDB_F_DELETED)
                    continue;
                did_info = &(did_icp->did_ic_blkp->blk_info
                    [li]);
                if ((did_info->info_flags & MDDB_DID_VALID) &&
                    !(did_info->info_flags &
                    MDDB_DID_UPDATED)) {
                    if (lbp->lb_flags & MDDB_MNSET) {
                        int j;
                        int index = -1;
                        mnlbp = (mddb_mnlb_t *)lbp;
                        /*
                         * Look for this side's entry,
                         * remembering the last slot
                         * whose sideno is 0.
                         */
                        for (j = 0; j < MD_MNMAXSIDES;
                            j++) {
                            mnslp = &mnlbp->
                                lb_mnsidelocators[j]
                                [li];
                            if (mnslp->mnl_sideno ==
                                s->s_sideno)
                                break;
                            if (mnslp->mnl_sideno ==
                                0)
                                index = j;
                        }
                        if (j == MD_MNMAXSIDES) {
                            /*
                             * No match found; take
                             * empty
                             *
                             * NOTE(review): index
                             * is still -1 if no
                             * slot had sideno 0 -
                             * confirm an empty
                             * slot always exists.
                             */
                            mnslp = &mnlbp->
                                lb_mnsidelocators
                                [index][li];
                            write_lb = 1;
                            mnslp->mnl_mnum =
                                md_getminor(newdev
                                [li]);
                        } else if (mnslp->mnl_mnum !=
                            md_getminor(newdev[li])) {
                            write_lb = 1;
                            mnslp->mnl_mnum =
                                md_getminor(newdev
                                [li]);
                        }
                    } else {
                        slp = &lbp->
                            lb_sidelocators[s->s_sideno]
                            [li];
                        if (slp->l_mnum !=
                            md_getminor(newdev[li])) {
                            write_lb = 1;
                            slp->l_mnum =
                                md_getminor(newdev
                                [li]);
                        }
                    }
                    /*
                     * Compare the driver name recorded for
                     * this side locator with the name of
                     * the (possibly new) major number.
                     */
                    name = ddi_major_to_name(md_getmajor(
                        newdev[li]));
                    if (lbp->lb_flags & MDDB_MNSET)
                        i = mnslp->mnl_drvnm_index;
                    else
                        i = slp->l_drvnm_index;
                    if (strncmp(lbp->lb_drvnm[i].dn_data,
                        name, lbp->lb_drvnm[i].dn_len) !=
                        0) {
                        /* Driver name has changed */
                        len = strlen(name);
                        /* Look for the driver name */
                        for (i = 0; i < MDDB_DRVNMCNT;
                            i++) {
                            if (lbp->lb_drvnm[i].
                                dn_len != len)
                                continue;
                            if (strncmp(lbp->
                                lb_drvnm[i].dn_data,
                                name, len) == 0)
                                break;
                        }
                        /* Didn't find one, add it */
                        if (i == MDDB_DRVNMCNT) {
                            /* first unused slot */
                            for (i = 0; i <
                                MDDB_DRVNMCNT;
                                i++) {
                                if (lbp->
                                    lb_drvnm[i].
                                    dn_len == 0)
                                    break;
                            }
                            if (i ==
                                MDDB_DRVNMCNT) {
                                cmn_err(CE_WARN,
                                    "Unable to "
                                    " update "
                                    "driver "
                                    " name for "
                                    "dev:  "
                                    "major = %d"
                                    ", minor = "
                                    "%d\n",
                                    md_getmajor(
                                    newdev[li]),
                                    md_getminor(
                                    newdev
                                    [li]));
                                /*
                                 * Next locator;
                                 * UPDATED flag
                                 * is not set.
                                 */
                                continue;
                            }
                            (void) strncpy(lbp->
                                lb_drvnm[i].dn_data,
                                name, MD_MAXDRVNM);
                            lbp->lb_drvnm[i].
                                dn_len = (uchar_t)
                                strlen(name);
                        }
                        /* Fill in the drvnm index */
                        if (lbp->lb_flags &
                            MDDB_MNSET)
                            mnslp->mnl_drvnm_index =
                                i;
                        else
                            slp->l_drvnm_index = i;
                        write_lb = 1;
                    }
                    did_info->info_flags |=
                        MDDB_DID_UPDATED;
                }
            }
        }
    }
    kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);

    /*
     * If locator block has been changed by get_mbs_n_lbs,
     * by addition of new device id, by updated minor name or
     * by updated driver name - write out locator block.
     */
    if (write_lb) {
        rval = push_lb(s);
        (void) upd_med(s, "load_old_replicas(0)");
        if (rval)
            goto errout;
    }

    /*
     * If the tag was moved, allocated, or a BADTAG was seen for some other
     * reason, then make sure tags are written to all the replicas.
     * Data tags not supported on MN sets.
     *
     * lc is reused here as a "tags need writing" flag.
     */
    if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
        if (! (lc = dt_alloc_if_needed(s))) {
            for (li = 0; li < lbp->lb_loccnt; li++) {
                lp = &lbp->lb_locators[li];

                if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
                    (lp->l_flags & MDDB_F_EMASTER))
                    continue;

                if (lp->l_flags & MDDB_F_BADTAG) {
                    lc = 1;
                    break;
                }
            }
        }

        if (lc) {
            md_set_setstatus(setno, MD_SET_TAGDATA);
            md_clr_setstatus(setno, MD_SET_BADTAG);
            (void) selectreplicas(s, MDDB_SCANALL);
        }
    }

errout:

    /* Free extraneous rip components. */
    for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
        /* Get rid of lbp's and dtp's */

        /* Everything hanging off the selected lbp's rip is kept */
        if (rip->ri_lbp != lbp) {
            if (rip->ri_dtp != (mddb_dt_t *)NULL) {
                kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
                rip->ri_dtp = (mddb_dt_t *)NULL;
            }

            if (rip->ri_devid != (ddi_devid_t)NULL) {
                sz = (int)ddi_devid_sizeof(rip->ri_devid);
                kmem_free((caddr_t)rip->ri_devid, sz);
                rip->ri_devid = (ddi_devid_t)NULL;
            }
            if (rip->ri_old_devid != (ddi_devid_t)NULL) {
                sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
                kmem_free((caddr_t)rip->ri_old_devid, sz);
                rip->ri_old_devid = (ddi_devid_t)NULL;
            }

            if (rip->ri_lbp != (mddb_lb_t *)NULL) {
                mddb_devid_icp_free(&rip->ri_did_icp,
                    rip->ri_lbp);

                kmem_free((caddr_t)rip->ri_lbp,
                    dbtob(rip->ri_lbp->lb_blkcnt));
                rip->ri_lbp = (mddb_lb_t *)NULL;
            }
        }

        /*
         * If this rip corresponds to a non-deleted locator in the
         * selected locator block, leave its master block alone.
         */
        if (lbp != NULL) {
            for (li = 0; li < lbp->lb_loccnt; li++) {
                lp = &lbp->lb_locators[li];
                if (lp->l_flags & MDDB_F_DELETED)
                    continue;
                if (rip->ri_dev == md_expldev(lp->l_dev) &&
                    rip->ri_blkno == lp->l_blkno)
                    break;
            }
            if (li < lbp->lb_loccnt)
                continue;
        }

        /*
         * Get rid of mbp's:
         *  if lbp, those out of lb_loccnt bounds
         *  if !lbp,  all of them.
         */
        if (rip->ri_mbip) {
            md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
            if (dev64 != NODEV64)
                mddb_devclose(dev64);

            free_mbipp(&rip->ri_mbip);
        }
        /*
         * Turn off MDDB_F_EMASTER flag in a diskset since diskset
         * code always ends up calling ridev for all replicas
         * before calling load_old_replicas.  ridev will reset
         * MDDB_F_EMASTER flag if flag was due to unresolved devid.
         */
        if (setno != MD_LOCAL_SET)
            rip->ri_flags &= ~MDDB_F_EMASTER;
    }
    return (retval);
}

/*
 * Given the devt from the md.conf info, get the devid for the device.
 *
 * The dev_t is built from the driver name (cl->l_driver) and minor number
 * (cl->l_mnum).  On success:
 *  - cl->l_devid holds the devid returned by ddi_lyr_get_devid() and
 *    cl->l_devid_sz its size,
 *  - cl->l_minor_name holds the (possibly truncated) minor name,
 *  - cl->l_devid_flags is set to SPACE | VALID | SZ, which tells
 *    parse_db_string() that it has a devid to free.
 *
 * On any failure a CE_NOTE is logged, nothing is stored in cl, and no
 * memory is left allocated.
 */
static void
lookup_db_devid(mddb_cfg_loc_t *cl)
{
    major_t     major;
    dev_t       ldev;
    ddi_devid_t devid;
    char        *minor;

    major = ddi_name_to_major(cl->l_driver);
    if (major == (major_t)-1) {
        cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
        return;
    }

    ldev = makedevice(major, cl->l_mnum);
    if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
            cl->l_driver, cl->l_mnum);
        return;
    }

    if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
            cl->l_mnum);
        /*
         * The devid is not handed to the caller on this path, so it
         * must be released here or it is leaked.
         */
        ddi_devid_free(devid);
        return;
    }

    cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
    cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
    cl->l_devid = (uint64_t)(uintptr_t)devid;
    (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);

    /* The minor name was copied; release the string we were given */
    kmem_free(minor, strlen(minor) + 1);
}

/*
 * grab driver name, minor, block and devid out of
 * strings like "driver:minor:block:devid"
 *
 * Parameters:
 *  str - NUL terminated entry to parse (a single location)
 *  clp - location structure that is filled in
 *
 * Returns 0 on success and -1 if a ':' separator is missing or the devid
 * string cannot be decoded.  On success with a non-NULL devid,
 * clp->l_devid_flags has MDDB_DEVID_SPACE set and clp->l_devid refers to
 * memory that the caller must free (clp->l_devid_sz bytes).
 */
static int
parse_db_loc(
    char        *str,
    mddb_cfg_loc_t  *clp
)
{
    char        *p, *e;
    char        *minor_name = NULL;
    ddi_devid_t ret_devid = NULL;

    /* Driver name: everything up to the first ':' (truncated to fit) */
    clp->l_dev = 0;
    p = clp->l_driver;
    e = p + sizeof (clp->l_driver) - 1;
    while ((*str != ':') && (*str != '\0') && (p < e))
        *p++ = *str++;
    *p = '\0';
    if (*str++ != ':')
        return (-1);

    /* Minor number, decimal */
    clp->l_mnum = 0;
    while (ISNUM(*str)) {
        clp->l_mnum *= 10;
        clp->l_mnum += *str++ - '0';
    }
    if (*str++ != ':')
        return (-1);

    /* Block number, decimal */
    clp->l_blkno = 0;
    while (ISNUM(*str)) {
        clp->l_blkno *= 10;
        clp->l_blkno += *str++ - '0';
    }
    if (*str++ != ':')
        return (-1);

    /*
     * If the md_devid_destroy flag is set, ignore the device ids.
     * This is only to be used in a catastrophic failure case.  Examples
     * would be where the device id of all drives in the system
     * (especially the mirror'd root drives) had been changed
     * by firmware upgrade or by a patch to an existing disk
     * driver.  Another example would be in the case of non-unique
     * device ids due to a bug.  The device id would be valid on
     * the system, but would return the wrong dev_t.
     */
    if (md_devid_destroy) {
        clp->l_devid_flags = 0;
        clp->l_devid = (uint64_t)NULL;
        clp->l_devid_sz = 0;
        clp->l_old_devid = (uint64_t)NULL;
        clp->l_old_devid_sz = 0;
        clp->l_minor_name[0] = '\0';
        return (0);
    }

    /* The remainder of the entry is the encoded devid (and minor name) */
    if (ddi_devid_str_decode(str,
        (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
        return (-1);

    clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
    clp->l_devid_flags = 0;
    clp->l_old_devid = (uint64_t)NULL;
    clp->l_old_devid_sz = 0;

    /* If no device id associated with device, just return */
    if (ret_devid == (ddi_devid_t)NULL) {
        clp->l_devid_sz = 0;
        clp->l_minor_name[0] = '\0';
        if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
            md_keep_repl_state == 0) {
            /*
             * No devid in md.conf; we're in recovery mode so
             * lookup the devid for the device as specified by
             * the devt in md.conf.
             */
            lookup_db_devid(clp);
        }
        return (0);
    }

    clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
        MDDB_DEVID_SZ;
    clp->l_devid_sz = (int)ddi_devid_sizeof(ret_devid);

    /*
     * The decoded string need not carry a minor name, in which case
     * minor_name is left NULL.  When present, copy it with a bound so an
     * over-long name from md.conf cannot overrun l_minor_name (this
     * matches the strlcpy() used by lookup_db_devid()).
     */
    if (minor_name != NULL) {
        (void) strlcpy(clp->l_minor_name, minor_name,
            MDDB_MINOR_NAME_MAX);
        kmem_free(minor_name, strlen(minor_name) + 1);
    } else {
        clp->l_minor_name[0] = '\0';
    }

    return (0);
}

/*
 * grab driver name, minor, and block out of
 * strings like "driver:minor:block:devid driver:minor:block:devid ..."
 */
static void
parse_db_string(
    char        *str
)
{
    char        *p, *e;
    mddb_cfg_loc_t  *cl;
    char        restore_space;

    /* CSTYLED */
    cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
    for (p = str; (*p != '\0'); ) {
        for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
            ;
        if (*p == '\0')
            break;
        for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
            ;
        /*
         * Only give parse_db_loc 1 entry, so stuff a null into
         * the string if we're not at the end.  We need to save this
         * char and restore it after call.
         */
        restore_space = '\0';
        if (*e != '\0') {
            restore_space = *e;
            *e = '\0';
        }
        if (parse_db_loc(p, cl) != 0) {
            cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
        } else {
            (void) ridev(
                &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
                cl, NULL, MDDB_F_PTCHED);
            if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
                kmem_free((caddr_t)(uintptr_t)cl->l_devid,
                    cl->l_devid_sz);
            }
        }
        if (restore_space != '\0') {
            *e = restore_space;
        }
        p = e;
    }
    kmem_free(cl, sizeof (mddb_cfg_loc_t));
}

/*
 * Fetch the database locations that md.conf supplies as
 * "mddb_bootlist<N>" properties and parse each of them.
 *
 * Every property name from mddb_bootlist0 up to (but not including)
 * mddb_bootlist<md_maxbootlist> is tried in turn: the name is formed, the
 * property fetched, parsed and then freed.  A property that is missing,
 * empty or that fails to load is simply ignored, since there is no
 * guarantee that the properties exist in sequence (mddb_bootlist1 and
 * mddb_bootlist3 may exist while mddb_bootlist2 does not).
 */
static void
parse_db_strings(void)
{
    /*
     * size of propname should match uses of line and entry in
     * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
     */
    char        propname[MDDB_BOOTLIST_MAX_LEN];
    caddr_t     propval;
    int     proplen;
    int     id = 0;

    while (id < md_maxbootlist) {
        proplen = 0;
        (void) sprintf(&propname[0], "mddb_bootlist%d", id);
        id++;

        if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
            DDI_PROP_CANSLEEP, &propname[0], (caddr_t)&propval,
            &proplen) == DDI_PROP_SUCCESS && proplen > 0) {

            if (md_init_debug)
                cmn_err(CE_NOTE, "%s is %s", &propname[0],
                    propval);

            parse_db_string(propval);
            kmem_free(propval, proplen);
        }
    }
}

/*
 * Initialize the in-core database state for set 'setno'.
 *
 * First tries to load the existing replicas with load_old_replicas().
 * If that fails and MDDB_MUSTEXIST is set in 'flag', everything that
 * was set up is torn down again and an error is returned.  If it fails
 * and MDDB_MUSTEXIST is not set, a brand new (empty) database is built
 * in core: locator block, locator names, optionally the device id
 * block, and the first directory block.
 *
 * Locking: mddb_lock and then the set mutex are acquired here and both
 * are released again before every return.
 *
 * Returns:
 *	0		the set was loaded or created, or s_lbp was
 *			already set on entry
 *	MDDB_E_NOTNOW	md_set[setno].s_db is NULL
 *	MDDB_E_TAGDATA	passed through from load_old_replicas()
 *	MDDB_E_NODB	load failed and MDDB_MUSTEXIST was set (the
 *			load_old_replicas() value is returned instead
 *			when mddb_db_err_detail is non-zero)
 */
static int
initit(
    set_t       setno,
    int     flag
)
{
    int     i;
    mddb_set_t  *s;
    mddb_lb_t   *lbp;       /* pointer to locator block */
    mddb_ln_t   *lnp;       /* pointer to locator names */
    mddb_db_t   *dbp;       /* pointer to directory block */
    mddb_did_blk_t  *did_blkp;  /* pointer to Device ID block */
    mddb_did_ic_t   *did_icp;   /* pointer to Device ID incore area */
    mddb_bf_t   *bfp;
    side_t      sideno;
    side_t      maxsides;
    mddb_block_t    lb_blkcnt;
    int     retval = 0;
    md_dev64_t  dev;
    mddb_mnlb_t *mnlbp;
    int     devid_flag;

    /*
     * Single-thread all loads/unloads of sets.  mddb_lock is always
     * taken before the set mutex.
     */
    mutex_enter(&mddb_lock);
    mutex_enter(SETMUTEX(setno));

    if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
        mutex_exit(SETMUTEX(setno));
        mutex_exit(&mddb_lock);
        return (MDDB_E_NOTNOW);
    }

    s = (mddb_set_t *)md_set[setno].s_db;

    single_thread_start(s);

    /*
     * A locator block is already attached to the set, so an earlier
     * call got here first.  Nothing to do; return success.
     */
    if (s->s_lbp) {
        single_thread_end(s);
        mutex_exit(SETMUTEX(setno));
        mutex_exit(&mddb_lock);
        return (0);
    }

    uniqtime32(&s->s_inittime);

    /* grab database locations patched by /etc/system */
    if (setno == MD_LOCAL_SET)
        parse_db_strings();

    /* one master block slot per possible replica */
    s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
        sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);

    s->s_zombie = 0;
    s->s_staledeletes = 0;
    s->s_optcmtcnt = 0;
    s->s_opthavelck = 0;
    s->s_optwantlck = 0;
    s->s_optwaiterr = 0;
    s->s_opthungerr = 0;

    /*
     * KEEPTAG can never be set for a MN diskset since no tags are
     * allowed to be stored in a MN diskset.  No way to check
     * if this is a MN diskset or not at this point since the mddb
     * hasn't been read in from disk yet.  (flag will only have
     * MULTINODE bit set if a new set is being created.)
     */
    if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
        dt_setup(s, NULL);

    md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);

    /* pre-allocate the set's pool of buffer headers */
    for (i = 0; i < mddb_maxbufheaders; i++) {
        bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
        sema_init(&bfp->bf_buf.b_io, 0, NULL,
            SEMA_DEFAULT, NULL);
        sema_init(&bfp->bf_buf.b_sem, 0, NULL,
            SEMA_DEFAULT, NULL);
        bfp->bf_buf.b_offset = -1;
        freebuffer(s, bfp);
    }

    retval = load_old_replicas(s, flag);
    /* If 0 return value - success */
    if (! retval) {
        single_thread_end(s);
        mutex_exit(SETMUTEX(setno));
        mutex_exit(&mddb_lock);
        return (0);
    }

    /*
     * If here, then the load_old_replicas() failed
     */


    /*
     * If the database was supposed to exist, undo everything that
     * was set up above (and anything load_old_replicas() left
     * behind) and fail.
     */
    if (flag & MDDB_MUSTEXIST) {
        if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
            for (i = 0; i < mddb_maxcopies;  i++) {
                if (! s->s_mbiarray[i])
                    continue;
                /*
                 * NOTE(review): s_lbp is dereferenced here
                 * without a NULL check; this presumes a
                 * populated s_mbiarray slot implies a
                 * locator block - confirm against
                 * load_old_replicas().
                 */
                dev = md_expldev(
                    s->s_lbp->lb_locators[i].l_dev);
                dev = md_xlate_targ_2_mini(dev);
                if (dev != NODEV64)
                    mddb_devclose(dev);

                free_mbipp(&s->s_mbiarray[i]);
            }

            kmem_free((caddr_t)s->s_mbiarray,
                sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
            s->s_mbiarray = NULL;
        }

        /* locator names are sized from the locator block */
        if (s->s_lnp != (mddb_ln_t *)NULL) {
            kmem_free((caddr_t)s->s_lnp,
                dbtob(s->s_lbp->lb_lnblkcnt));
            s->s_lnp = (mddb_ln_t *)NULL;
        }

        /* must happen before the locator block itself is freed */
        mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);

        if (s->s_lbp != (mddb_lb_t *)NULL) {
            kmem_free((caddr_t)s->s_lbp,
                dbtob(s->s_lbp->lb_blkcnt));
            s->s_lbp = (mddb_lb_t *)NULL;
        }

        /* drain and free the buffer header pool */
        while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
            kmem_free((caddr_t)bfp, sizeof (*bfp));

        single_thread_end(s);
        mutex_exit(SETMUTEX(setno));
        mutex_exit(&mddb_lock);

        /* tag data errors are always reported as such */
        if (retval == MDDB_E_TAGDATA)
            return (retval);

        /* Want a bit more detailed error messages */
        if (mddb_db_err_detail)
            return (retval);

        return (MDDB_E_NODB);
    }


    /*
     * MDDB_MUSTEXIST is not set - Creating a new database, so do
     * more initialization.
     */

    /* locator block size depends on the kind of set */
    lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
        MDDB_LOCAL_LBCNT : MDDB_LBCNT);
    if (flag & MDDB_MULTINODE) {
        lb_blkcnt = MDDB_MNLBCNT;
    }

    if (s->s_lbp == NULL)
        s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
    lbp = s->s_lbp;

    /* start from an all-zero locator block */
    bzero((caddr_t)lbp, dbtob(lb_blkcnt));
    lbp->lb_setno = setno;
    lbp->lb_magic = MDDB_MAGIC_LB;
    if (flag & MDDB_MULTINODE) {
        lbp->lb_revision = MDDB_REV_MNLB;
    } else {
        lbp->lb_revision = MDDB_REV_LB;
    }
    lbp->lb_inittime = s->s_inittime;
    /* mark every side locator as unused (minor number NODEV32) */
    if (flag & MDDB_MULTINODE) {
        mnlbp = (mddb_mnlb_t *)lbp;
        for (i = 0; i < MDDB_NLB; i++) {
            for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
                mddb_mnsidelocator_t    *mnslp;
                mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
                mnslp->mnl_mnum = NODEV32;
                mnslp->mnl_sideno = 0;
                mnslp->mnl_drvnm_index = 0;
            }
        }
    } else {
        /* the local set only ever has a single side */
        maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
        for (i = 0; i < MDDB_NLB; i++) {
            for (sideno = 0; sideno < maxsides; sideno++) {
                mddb_sidelocator_t  *slp;
                slp = &lbp->lb_sidelocators[sideno][i];
                slp->l_mnum = NODEV32;
            }
        }
    }
    lbp->lb_blkcnt = lb_blkcnt;

    /* lb starts on block 0 */
    /* locator names starts after locator block */
    lbp->lb_lnfirstblk = lb_blkcnt;
    if (flag & MDDB_MULTINODE) {
        lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
    } else {
        lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
            MDDB_LOCAL_LNCNT : MDDB_LNCNT);
    }

    if (flag & MDDB_MULTINODE) {
        /* Creating a multinode diskset */
        md_set_setstatus(setno, MD_SET_MNSET);
        lbp->lb_flags |= MDDB_MNSET;
    }

    /* Data portion of mddb located after locator names */
    lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;

    /* the btodb that follows is converting the directory block size */
    /* Data tag part of mddb located after first block of mddb data */
    lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
        btodb(MDDB_BSIZE));
    /* Data tags are not used in MN diskset - so set count to 0 */
    if (flag & MDDB_MULTINODE)
        lbp->lb_dtblkcnt = (mddb_block_t)0;
    else
        lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;


    /* build an empty locator names area of the matching revision */
    lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
    lnp->ln_magic = MDDB_MAGIC_LN;
    if (flag & MDDB_MULTINODE) {
        lnp->ln_revision = MDDB_REV_MNLN;
    } else {
        lnp->ln_revision = MDDB_REV_LN;
    }
    s->s_lnp = lnp;

    /*
     * Set up Device ID portion of Locator Block.
     * Do not set locator to device id style if
     * md_devid_destroy is 1 and md_keep_repl_state is 1
     * (destroy all device id data and keep replica in
     * non device id mode).
     *
     * This is logically equivalent to set locator to
     * device id style if md_devid_destroy is 0 or
     * md_keep_repl_state is 0.
     *
     * In SunCluster environment, device id mode is disabled
     * which means diskset will be run in non-devid mode.  For
     * localset, the behavior will remain intact and run in
     * device id mode.
     *
     * In multinode diskset devids are turned off.
     */
    devid_flag = 1;
    if (cluster_bootflags & CLUSTER_CONFIGURED)
        if (setno != MD_LOCAL_SET)
            devid_flag = 0;
    if (flag & MDDB_MULTINODE)
        devid_flag = 0;
    if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
        devid_flag = 0;
    /*
     * if we weren't devid style before and md_keep_repl_state=1
     * we need to stay non-devid
     *
     * NOTE(review): lbp was bzero'ed above and only MDDB_MNSET has
     * been or'ed into lb_flags since, so MDDB_DEVID_STYLE is never
     * set at this point and this test reduces to
     * md_keep_repl_state == 1 - confirm that is intended.
     */
    if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
        (md_keep_repl_state == 1))
        devid_flag = 0;
    if (devid_flag) {
        /* device id blocks are located after the data tag blocks */
        lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
            lbp->lb_dtblkcnt;
        lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
        lbp->lb_flags |= MDDB_DEVID_STYLE;

        did_icp = (mddb_did_ic_t *)kmem_zalloc
            (sizeof (mddb_did_ic_t), KM_SLEEP);
        did_blkp = (mddb_did_blk_t *)
            kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
        did_blkp->blk_magic = MDDB_MAGIC_DI;
        did_blkp->blk_revision = MDDB_REV_DI;
        did_icp->did_ic_blkp = did_blkp;
        s->s_did_icp = did_icp;
    }

    setidentifier(s, &lbp->lb_ident);
    uniqtime32(&lbp->lb_timestamp);
    /* first (empty) directory block, at the start of the data area */
    dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
    dbp->db_magic = MDDB_MAGIC_DB;
    dbp->db_revision = MDDB_REV_DB;
    uniqtime32(&dbp->db_timestamp);
    dbp->db_nextblk = 0;
    dbp->db_firstentry = NULL;
    dbp->db_blknum = lbp->lb_dbfirstblk;
    dbp->db_recsum = MDDB_GLOBAL_XOR;
    s->s_dbp = dbp;
    single_thread_end(s);
    mutex_exit(SETMUTEX(setno));
    mutex_exit(&mddb_lock);
    return (0);
}

/*
 * Enter the given set: take the set mutex and return the set's in-core
 * mddb state, initializing the set first (via initit) when it has no
 * locator block yet and MDDB_NOINIT is not set in 'flag'.
 *
 * The per-set s_un and s_ui arrays are allocated here if they are not
 * present yet.
 *
 * On success the set's mddb_set_t is returned and the set mutex is
 * HELD; the caller releases it with mddb_setexit().  On failure NULL
 * is returned, the set mutex is NOT held, and, if errorcodep is not
 * NULL, *errorcodep is set to MDDB_E_NOTOWNER (no database for the set
 * or an allocation failed) or to the error returned by initit().
 */
mddb_set_t *
mddb_setenter(
    set_t       setno,
    int     flag,
    int     *errorcodep
)
{
    mddb_set_t  *s;
    int     err = 0;
    int     un_allocated = 0;   /* s_un was allocated by this call */
    size_t      sz = sizeof (void *) * MD_MAXUNITS;

    mutex_enter(SETMUTEX(setno));
    if (! md_set[setno].s_db) {
        mutex_exit(SETMUTEX(setno));
        if (errorcodep != NULL)
            *errorcodep = MDDB_E_NOTOWNER;
        return (NULL);
    }

    /* Allocate s_un and s_ui arrays if not already present. */
    if (md_set[setno].s_un == NULL) {
        md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
        if (md_set[setno].s_un == NULL) {
            mutex_exit(SETMUTEX(setno));
            if (errorcodep != NULL)
                *errorcodep = MDDB_E_NOTOWNER;
            return (NULL);
        }
        un_allocated = 1;
    }
    if (md_set[setno].s_ui == NULL) {
        md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
        if (md_set[setno].s_ui == NULL) {
            /*
             * Back out the s_un allocation made above.  This is
             * done while the set mutex is still held so that no
             * other thread can see (or replace) the pointer
             * while it is being freed, and only when s_un was
             * allocated by this call: an array that was already
             * there on entry is not ours to free.
             */
            if (un_allocated) {
                kmem_free(md_set[setno].s_un, sz);
                md_set[setno].s_un = NULL;
            }
            mutex_exit(SETMUTEX(setno));
            if (errorcodep != NULL)
                *errorcodep = MDDB_E_NOTOWNER;
            return (NULL);
        }
    }

    /* already initialized, or caller does not want initialization */
    s = (mddb_set_t *)md_set[setno].s_db;
    if (s->s_lbp)
        return (s);

    if (flag & MDDB_NOINIT)
        return (s);

    /*
     * Release the set mutex - it will be acquired and released in
     * initit after acquiring the mddb_lock.  This is done to assure
     * that mutexes are always acquired in the same order to prevent
     * possible deadlock
     */
    mutex_exit(SETMUTEX(setno));

    if ((err = initit(setno, flag)) != 0) {
        if (errorcodep != NULL)
            *errorcodep = err;
        return (NULL);
    }

    /* re-read s_db: the mutex was dropped around initit */
    mutex_enter(SETMUTEX(setno));
    return ((mddb_set_t *)md_set[setno].s_db);
}

/*
 * Release the set lock for a given set.
 *
 * In a MN diskset, this routine may send messages to the rpc.mdcommd
 * in order to have the slave nodes re-parse parts of the mddb.
 * Messages are only sent if the global ioctl lock is not held.
 *
 * With the introduction of multi-threaded ioctls, there is no way
 * to determine which thread(s) are holding the ioctl lock.  So, if
 * the ioctl lock is held (by process X) process X will send the
 * messages to the slave nodes when process X releases the ioctl lock.
 *
 * The caller must hold the set mutex of 's'; it is not held on return.
 */
void
mddb_setexit(
    mddb_set_t  *s
)
{
    md_mn_msg_mddb_parse_t      *mddb_parse_msg;
    md_mn_kresult_t         *kresult;
    mddb_lb_t           *lbp = s->s_lbp;
    int             i;
    int             rval;

    /*
     * If not a MN diskset OR
     * a MN diskset but this node isn't master,
     * then release the mutex.
     */
    if (!(MD_MNSET_SETNO(s->s_setno)) ||
        (!md_set[s->s_setno].s_am_i_master)) {
        mutex_exit(SETMUTEX(s->s_setno));
        return;
    }

    /*
     * If global ioctl lock is held, then send no messages,
     * just release mutex and return.
     */
    if (md_status & MD_GBL_IOCTL_LOCK) {
        mutex_exit(SETMUTEX(s->s_setno));
        return;
    }

    /*
     * This thread is not holding the ioctl lock, so drop the set
     * lock, send messages to slave nodes to reparse portions
     * of the mddb and return.
     *
     * If the block parse flag is set, do not send parse messages.
     * This flag is set when master is adding a new mddb that would
     * cause parse messages to be sent to the slaves, but the slaves
     * don't have knowledge of the new mddb yet since the mddb add
     * operation hasn't been run on the slave nodes yet.  When the
     * master unblocks the parse flag, the parse messages will be
     * generated.
     *
     * If s_mn_parseflags_sending is non-zero, then another thread
     * is already currently sending a parse message, so just release
     * the mutex and return.  If an mddb change occurred that results
     * in a parse message to be generated, the thread that is currently
     * sending a parse message would generate the additional parse message.
     *
     * If s_mn_parseflags_sending is zero and parsing is not blocked,
     * then loop until s_mn_parseflags is 0 (until there are no more
     * messages to send).
     * While s_mn_parseflags is non-zero,
     *  put snapshot of parse_flags in s_mn_parseflags_sending
     *  set s_mn_parseflags to zero
     *  release mutex
     *  send message
     *  re-grab mutex
     *  set s_mn_parseflags_sending to zero
     */
    mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
    while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
        (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
        (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
        /* Grab snapshot of parse flags */
        s->s_mn_parseflags_sending = s->s_mn_parseflags;
        s->s_mn_parseflags = 0;

        mutex_exit(SETMUTEX(s->s_setno));

        /*
         * Send the message to the slaves to re-parse
         * the indicated portions of the mddb. Send the status
         * of all the mddbs in this set so that slaves know which
         * mddbs that the master node thinks are 'good'.
         * Otherwise, slave may reparse, but from wrong replica.
         */
        mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
        for (i = 0; i < MDDB_NLB; i++) {
            mddb_parse_msg->msg_lb_flags[i] =
                lbp->lb_locators[i].l_flags;
        }
        kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
        /*
         * Keep trying until this snapshot has been delivered.
         * The send must be attempted at least once for EVERY
         * snapshot; a result left over from an earlier pass of
         * the outer loop must not be allowed to skip it, or the
         * flags cleared above would be lost without the slaves
         * ever being told.
         */
        do {
            rval = mdmn_ksend_message(s->s_setno,
                MD_MN_MSG_MDDB_PARSE, 0, 0,
                (char *)mddb_parse_msg,
                sizeof (md_mn_msg_mddb_parse_t), kresult);
            if (rval != 0)
                cmn_err(CE_WARN, "mddb_setexit: Unable to send "
                    "mddb update message to other nodes in "
                    "diskset %s\n", s->s_setname);
        } while (rval != 0);
        kmem_free(kresult, sizeof (md_mn_kresult_t));

        /*
         * Re-grab mutex to clear sending field and to
         * see if another parse message needs to be generated.
         */
        mutex_enter(SETMUTEX(s->s_setno));
        s->s_mn_parseflags_sending = 0;
    }
    kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
    mutex_exit(SETMUTEX(s->s_setno));
}

/*
 * Release the set mutex of 's' and nothing else: unlike mddb_setexit()
 * this variant never sends parse messages to other nodes.
 */
static void
mddb_setexit_no_parse(
    mddb_set_t  *s
)
{
    set_t       setno = s->s_setno;

    mutex_exit(SETMUTEX(setno));
}

/*
 * Size, and optionally perform, the conversion of a set's locator
 * block to device id style.
 *
 * *blk_cnt is always set to a conservative count of the disk blocks the
 * device id information needs: MDDB_DID_BLOCKS for the device id block
 * itself plus at least one block for every replica whose device id can
 * be obtained.
 *
 * When 'doit' is non-zero the conversion is also carried out: the set
 * is entered, blocks for the device id block are allocated, the device
 * id and minor name of every usable replica are added with
 * mddb_devid_add(), and the locator block is pushed to disk.
 *
 * Returns 0 on success.  Returns 1 (only possible when 'doit' is set)
 * if the set cannot be entered, no free blocks are available, or the
 * locator block push fails.
 */
uint_t
mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
{
    mddb_lb_t       *lbp = s->s_lbp;
    mddb_locator_t      *locp;
    mddb_did_ic_t       *icp;
    mddb_did_blk_t      *blkp;
    ddi_devid_t     devid;
    char            *mname;
    md_dev64_t      dev64;
    dev_t           dev;
    uint_t          idx;
    uint_t          len;
    int         enter_err;
    int         push_err;

    /* Need disk block(s) to hold mddb_did_blk_t */
    *blk_cnt = MDDB_DID_BLOCKS;

    if (doit) {
        /*
         * Alloc mddb_did_blk_t disk block and fill in header area.
         * The did magic number is deliberately left unset until
         * the end of the routine: should the machine panic in the
         * middle of the conversion, the device id information is
         * then thrown away at the next snarfing of this set.
         * DEVID_STYLE has to be set for mddb_devid_add to
         * function properly.
         */
        /* grab the mutex */
        if (mddb_setenter(s->s_setno, MDDB_NOINIT, &enter_err) == NULL)
            return (1);

        single_thread_start(s);
        lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
        if (lbp->lb_didfirstblk == 0) {
            single_thread_end(s);
            mddb_setexit(s);
            return (1);
        }
        lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;

        icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
            KM_SLEEP);
        blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
            KM_SLEEP);
        blkp->blk_revision = MDDB_REV_DI;
        icp->did_ic_blkp = blkp;
        s->s_did_icp = icp;
        lbp->lb_flags |= MDDB_DEVID_STYLE;
    }

    /* Fill in information in mddb_did_info_t array */
    for (idx = 0; idx < lbp->lb_loccnt; idx++) {
        locp = &lbp->lb_locators[idx];
        if (locp->l_flags & MDDB_F_DELETED)
            continue;

        dev64 = md_xlate_targ_2_mini(md_expldev(locp->l_dev));
        dev = md_dev64_to_dev(dev64);

        /*
         * No translation available for replica.  The conversion
         * could be failed here, but instead the replica is just
         * passed over.
         */
        if (dev == NODEV)
            continue;

        /* replicas without a device id are passed over as well */
        if (ddi_lyr_get_devid(dev, &devid) != DDI_SUCCESS)
            continue;

        /*
         * Just count each devid as at least 1 block.  This
         * is conservative since several device id's may fit
         * into 1 disk block, but it's better to overestimate
         * the number of blocks needed than to underestimate.
         */
        len = (int)ddi_devid_sizeof(devid);
        *blk_cnt += btodb(len + (MDDB_BSIZE - 1));

        if (doit && (ddi_lyr_get_minor_name(dev, S_IFBLK,
            &mname) == DDI_SUCCESS)) {
            if (mddb_devid_add(s, idx, devid, mname)) {
                cmn_err(CE_WARN,
                    "Not enough space in metadb"
                    " to add device id for"
                    "  dev: major = %d, "
                    "minor = %d\n",
                    getmajor(dev),
                    getminor(dev));
            }
            kmem_free(mname, strlen(mname) + 1);
        }
        ddi_devid_free(devid);
    }

    /* sizing only: nothing was changed, nothing to write */
    if (!doit)
        return (0);

    /* conversion complete; the device id block is now valid */
    blkp->blk_magic = MDDB_MAGIC_DI;
    push_err = push_lb(s);
    (void) upd_med(s, "mddb_lb_did_convert(0)");
    single_thread_end(s);
    mddb_setexit(s);

    return ((push_err != 0) ? 1 : 0);
}

/*
 * Create the in-core mddb_set_t for a set, if it does not exist yet,
 * and enter the set.
 *
 * 'cp' supplies the set name, set number, side number, creation time
 * and mediator information; a NULL 'cp' selects the local set
 * (MD_LOCAL_SET, side 0).  'flag' and 'errp' are handed on unchanged
 * to mddb_setenter().
 *
 * Returns whatever mddb_setenter() returns, or NULL if the set number
 * is not below MD_MAXSETS (in which case *errp is left untouched).
 */
static mddb_set_t *
init_set(
    mddb_config_t   *cp,
    int     flag,
    int     *errp
)
{
    mddb_set_t  *newset;
    set_t       setno;
    side_t      sideno;
    char        *name;
    struct timeval32 *ctime;

    /* defaults describe the local set */
    name = NULL;
    ctime = NULL;
    setno = MD_LOCAL_SET;
    sideno = 0;
    if (cp != NULL) {
        name = cp->c_setname;
        ctime = &cp->c_timestamp;
        setno = cp->c_setno;
        sideno = cp->c_sideno;
    }

    if (setno >= MD_MAXSETS)
        return ((mddb_set_t *)NULL);

    /* the set structure already exists, simply enter the set */
    if (md_set[setno].s_db)
        return (mddb_setenter(setno, flag, errp));

    newset = (mddb_set_t *)kmem_zalloc(sizeof (mddb_set_t), KM_SLEEP);

    cv_init(&newset->s_buf_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&newset->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&newset->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&newset->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&newset->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);

    newset->s_setno = setno;
    newset->s_sideno = sideno;

    /*
     * The local set is identified by the hostid; any other set by
     * its creation time, and it also carries a copy of its name.
     */
    if (setno != MD_LOCAL_SET) {
        newset->s_ident.createtime = *ctime;
        newset->s_setname = (char *)kmem_alloc(strlen(name) + 1,
            KM_SLEEP);
        (void) strcpy(newset->s_setname, name);
    } else {
        (void) snprintf(newset->s_ident.serial,
            sizeof (newset->s_ident.serial),
            "%u", zone_get_hostid(NULL));
    }

    /* have a config struct,  copy mediator information */
    if (cp != NULL)
        newset->s_med = cp->c_med;  /* structure assignment */

    /* publish the new set before entering it */
    md_set[setno].s_db = (void *) newset;

    SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);

    return (mddb_setenter(setno, flag, errp));
}

/*
 * Tear down the in-core mddb state of set 'setno'.
 *
 * Enters the set (without initializing it), frees every directory
 * block, directory entry and record held in core, the replica list,
 * the master blocks (closing the replica devices), locator names,
 * device id data, locator block, free bitmap, buffer headers, data
 * buffer and set name, and finally the mddb_set_t itself.  The set's
 * data tag (unless MD_SET_KEEPTAG is set) and a number of set status
 * bits are cleared as well.
 *
 * Does nothing if the set cannot be entered.  The set mutex is taken
 * through mddb_setenter() and released before returning.
 */
void
mddb_unload_set(
    set_t       setno
)
{

    mddb_set_t  *s;
    mddb_db_t   *dbp, *adbp = NULL;
    mddb_de_ic_t    *dep, *dep2;
    mddb_bf_t   *bfp;
    int     i;
    md_dev64_t  dev;

    if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
        return;

    single_thread_start(s);

    s->s_opthavequeuinglck = 0;
    s->s_optwantqueuinglck = 0;

    /* free every directory block together with its entries */
    for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
        for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
            if (dep->de_rb_userdata != NULL) {
                /*
                 * a non-zero de_icreqsize means the in-core
                 * copy of the user data is the one that was
                 * allocated
                 */
                if (dep->de_icreqsize)
                    kmem_free(dep->de_rb_userdata_ic,
                        dep->de_icreqsize);
                else
                    kmem_free(dep->de_rb_userdata,
                        dep->de_reqsize);
            }
            kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
            /* pick up the link before the entry goes away */
            dep2 = dep->de_next;
            kmem_free((caddr_t)dep, sizeofde(dep));
        }
        adbp = dbp->db_next;
        kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
    }
    s->s_dbp = (mddb_db_t *)NULL;

    free_rip(&s->s_rip);

    /* close every open replica and free its master blocks */
    for (i = 0; i < mddb_maxcopies;  i++) {
        if (! s->s_mbiarray)
            break;

        if (! s->s_mbiarray[i])
            continue;

        dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
        dev = md_xlate_targ_2_mini(dev);
        if (dev != NODEV64)
            mddb_devclose(dev);

        free_mbipp(&s->s_mbiarray[i]);
    }

    if (s->s_mbiarray) {
        kmem_free((caddr_t)s->s_mbiarray,
            sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
        s->s_mbiarray = (mddb_mb_ic_t **)NULL;
    }

    /* the locator names are sized from the locator block: free first */
    if (s->s_lnp) {
        kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
        s->s_lnp = (mddb_ln_t *)NULL;
    }

    if (s->s_lbp) {
        mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
        kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
        s->s_lbp = (mddb_lb_t *)NULL;
    }

    if (s->s_freebitmap) {
        kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
        s->s_freebitmap = NULL;
        s->s_freebitmapsize = 0;
    }

    /* drain and free the buffer header pool */
    while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
        kmem_free((caddr_t)bfp, sizeof (*bfp));

    if (s->s_databuffer_size) {
        kmem_free(s->s_databuffer, s->s_databuffer_size);
        s->s_databuffer_size = 0;
    }

    if (s->s_setname != NULL)
        kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);

    /* Data tags not supported on MN sets. */
    if (!(md_get_setstatus(setno) & MD_SET_MNSET))
        dtl_freel(&s->s_dtlp);

    md_set[setno].s_db = NULL;
    ASSERT(s->s_singlelockwanted == 0);

    /*
     * Every condition variable that was set up with cv_init() when
     * the set structure was created has to be destroyed before the
     * memory containing it is given back.
     */
    cv_destroy(&s->s_buf_cv);
    cv_destroy(&s->s_single_thread_cv);
    cv_destroy(&s->s_optqueuing_cv);
    cv_destroy(&s->s_opthungerr_cv);
    cv_destroy(&s->s_optwantlck_cv);
    kmem_free(s, sizeof (mddb_set_t));

    /* Take care of things setup in the md_set array */
    if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
        if (md_set[setno].s_dtp) {
            kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
            md_set[setno].s_dtp = NULL;
        }
    }

    md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
        MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
        MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
        MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
        MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);

    /* s has been freed, so the mutex is released by set number */
    mutex_exit(SETMUTEX(setno));
}

/*
 * Store a splitname (prefix, suffix) for locator 'li' in the locator
 * name structure.
 *
 * returns 0 if name can be put into locator block
 * returns 1 if locator block prefixes are all used
 *
 * The prefix is shared: an existing identical prefix is reused when
 * there is one, otherwise the first unused prefix slot is filled in.
 *
 * For a traditional diskset, the sideno is the index into the suffixes
 * array in the locator name structure.
 * For a MN diskset, the sideno is the nodeid which can be any number,
 * so 'index' is used as the index into the mnsuffixes array instead
 * and the sideno is recorded in the entry.  The comment this replaces
 * states that the index is computed by the routine checklocator from
 * the locator block's mnside locator structure.
 */
static int
splitname2locatorblock(
    md_splitname    *spn,
    mddb_ln_t   *lnp,
    int     li,
    side_t      sideno,
    int     index
)
{
    uchar_t         slot;
    md_name_suffix      *sufp;
    md_mnname_suffix_t  *mnsufp;
    mddb_mnln_t     *mnlnp;

    /* first choice: a prefix slot already holding this very prefix */
    slot = 0;
    while (slot < MDDB_PREFIXCNT) {
        if ((lnp->ln_prefixes[slot].pre_len ==
            SPN_PREFIX(spn).pre_len) &&
            (bcmp(lnp->ln_prefixes[slot].pre_data,
            SPN_PREFIX(spn).pre_data,
            SPN_PREFIX(spn).pre_len) == 0))
            break;
        slot++;
    }

    /* second choice: claim the first prefix slot that is not in use */
    if (slot == MDDB_PREFIXCNT) {
        slot = 0;
        while ((slot < MDDB_PREFIXCNT) &&
            (lnp->ln_prefixes[slot].pre_len != 0))
            slot++;

        if (slot == MDDB_PREFIXCNT)
            return (1);

        lnp->ln_prefixes[slot].pre_len = SPN_PREFIX(spn).pre_len;
        bcopy(SPN_PREFIX(spn).pre_data,
            lnp->ln_prefixes[slot].pre_data,
            SPN_PREFIX(spn).pre_len);
    }

    if (lnp->ln_revision != MDDB_REV_MNLN) {
        /* traditional diskset: the suffix is indexed by side */
        sufp = &lnp->ln_suffixes[sideno][li];
        sufp->suf_prefix = slot;
        sufp->suf_len = SPN_SUFFIX(spn).suf_len;
        bcopy(SPN_SUFFIX(spn).suf_data, sufp->suf_data,
            SPN_SUFFIX(spn).suf_len);
        return (0);
    }

    /* If a MN diskset, use index */
    mnlnp = (mddb_mnln_t *)lnp;
    mnsufp = &mnlnp->ln_mnsuffixes[index][li];
    mnsufp->mn_ln_sideno = sideno;
    mnsufp->mn_ln_suffix.suf_prefix = slot;
    mnsufp->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
    bcopy(SPN_SUFFIX(spn).suf_data,
        mnsufp->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
    return (0);
}

/*
 * Find the locator name of locator 'li' for the given sideno and
 * convert the locator name information into a splitname structure.
 *
 * For a MN locator name structure the entries of the locator are
 * searched for one recorded against 'sideno'; if there is none, 'spn'
 * is left untouched.  Otherwise 'sideno' indexes the suffix array
 * directly.
 */
void
mddb_locatorblock2splitname(
    mddb_ln_t   *lnp,
    int     li,
    side_t      sideno,
    md_splitname    *spn
)
{
    int         pfx;
    int         slot;
    md_name_suffix      *sufp;
    md_mnname_suffix_t  *mnsufp;
    mddb_mnln_t     *mnlnp;

    if (lnp->ln_revision != MDDB_REV_MNLN) {
        /* traditional diskset: the side number is the index */
        sufp = &lnp->ln_suffixes[sideno][li];
        pfx = sufp->suf_prefix;
        SPN_SUFFIX(spn).suf_len = sufp->suf_len;
        bcopy(sufp->suf_data, SPN_SUFFIX(spn).suf_data,
            SPN_SUFFIX(spn).suf_len);
    } else {
        /* MN diskset: look for the entry recorded for this side */
        mnlnp = (mddb_mnln_t *)lnp;
        slot = 0;
        while ((slot < MD_MNMAXSIDES) &&
            (mnlnp->ln_mnsuffixes[slot][li].mn_ln_sideno != sideno))
            slot++;

        /* no entry for this side */
        if (slot == MD_MNMAXSIDES)
            return;

        mnsufp = &mnlnp->ln_mnsuffixes[slot][li];
        pfx = mnsufp->mn_ln_suffix.suf_prefix;
        SPN_SUFFIX(spn).suf_len = mnsufp->mn_ln_suffix.suf_len;
        bcopy(mnsufp->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
            SPN_SUFFIX(spn).suf_len);
    }

    /* the suffix names the shared prefix it belongs to */
    SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[pfx].pre_len;
    bcopy(lnp->ln_prefixes[pfx].pre_data, SPN_PREFIX(spn).pre_data,
        SPN_PREFIX(spn).pre_len);
}

/*
 * getdeldev - report on, and for MDDB_DELDEV delete, one replica of a set.
 *
 *  cp->c_id selects the replica.  When cp->c_subcmd is MDDB_CONFIG_ABS it
 *  is an absolute index into the locator array; otherwise it counts only
 *  locators that are not flagged MDDB_F_DELETED.
 *
 *  For every command the selected replica's locator and device name are
 *  copied out through cp, along with c_dbmax, c_dbcnt, c_sideno and the
 *  MDDB_C_STALE / MDDB_C_TOOFEW flags.  MDDB_ENDDEV additionally sets
 *  c_dbend to the highest physical block returned by getphysblk() for the
 *  replica (0 if it has no in-core master blocks).  MDDB_DELDEV goes on to
 *  invalidate the replica's on-disk locator block, release its in-core
 *  master blocks, mark the locator deleted and rewrite the locator block
 *  on the remaining replicas.
 *
 *  RETURN
 *      0       Success
 *      non-zero    Error; ep is filled in by mdmderror(),
 *              mdmddberror() or mddbstatus2error()
 */
static int
getdeldev(
    mddb_config_t   *cp,
    int     command,
    md_error_t  *ep
)
{
    mddb_set_t  *s;
    mddb_lb_t   *lbp;
    mddb_locator_t  *locators;
    uint_t      loccnt;
    mddb_mb_ic_t    *mbip;
    mddb_block_t    blk;
    int     err = 0;
    int     i, j;
    int     li;
    uint_t      commitcnt;
    set_t       setno = cp->c_setno;
    uint_t      set_status;
    md_dev64_t  dev;
    int     flags = MDDB_MUSTEXIST;
    mddb_ri_t   *rip;

    cp->c_dbmax = MDDB_NLB;

    /*
     * Data checking
     */
    if (setno >= md_nsets || cp->c_id < 0 ||
        cp->c_id > cp->c_dbmax) {
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
    }

    if (cp->c_flags & MDDB_C_STALE)
        flags |= MDDB_MN_STALE;

    if ((s = mddb_setenter(setno, flags, &err)) == NULL)
        return (mddbstatus2error(ep, err, NODEV32, setno));

    cp->c_flags = 0;

    lbp = s->s_lbp;
    loccnt = lbp->lb_loccnt;
    locators = lbp->lb_locators;

    /* shorthand */
    set_status = md_get_setstatus(setno);

    if (set_status & MD_SET_STALE)
        cp->c_flags |= MDDB_C_STALE;

    if (set_status & MD_SET_TOOFEW)
        cp->c_flags |= MDDB_C_TOOFEW;

    cp->c_sideno = s->s_sideno;

    cp->c_dbcnt = 0;
    /*
     * go through and count active entries
     */
    for (i = 0; i < loccnt;  i++) {
        if (locators[i].l_flags & MDDB_F_DELETED)
            continue;
        cp->c_dbcnt++;
    }

    /*
     * add the ability to accept a locator block index
     * which is not relative to previously deleted replicas.  This
     * is for support of MD_DEBUG=STAT in metastat since it asks for
     * replica information specifically for each of the mirror resync
     * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
     * the mddb_config_t type.
     */
    if (cp->c_subcmd == MDDB_CONFIG_ABS) {
        /*
         * c_id is used directly as an array index below, so it must
         * be strictly less than the maximum number of locators
         * (c_dbmax == MDDB_NLB); an index equal to the maximum is
         * one past the last slot.
         */
        if (cp->c_id < 0 || cp->c_id >= cp->c_dbmax) {
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
                setno));
        }
        li = cp->c_id;
    } else {
        if (cp->c_id >= cp->c_dbcnt) {
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
                setno));
        }

        /*
         * Walk to the c_id'th locator that is not deleted.  The loop
         * has no explicit bound; it terminates because c_id was just
         * checked against the count of non-deleted locators.
         */
        /* CSTYLED */
        for (li = 0, j = 0; /* void */; li++) {
            if (locators[li].l_flags & MDDB_F_DELETED)
                continue;
            j++;
            if (j > cp->c_id)
                break;
        }
    }

    if (command == MDDB_ENDDEV) {
        daddr_t ib = 0, jb;

        blk = 0;
        /* s was already dereferenced above, so it cannot be NULL. */
        if (s->s_mbiarray[li]) {
            mbip = s->s_mbiarray[li];
            /* Track the largest physical block of the replica. */
            while ((jb = getphysblk(blk++, mbip)) > 0) {
                if (jb > ib)
                    ib = jb;
            }
            cp->c_dbend = (int)ib;
        } else {
            cp->c_dbend = 0;
        }
    }

    locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
    mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);

    /* Everything except a delete request is finished at this point. */
    if (command != MDDB_DELDEV) {
        mddb_setexit(s);
        return (0);
    }

    /* Currently don't allow addition/deletion of sides during upgrade */
    if (MD_UPGRADE) {
        cmn_err(CE_WARN,
            "Deletion of replica not allowed during upgrade.\n");
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
    }

    /*
     * If here, replica delete in progress.
     */
    single_thread_start(s);

    if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
        (locators[li].l_flags & MDDB_F_ACTIVE)) {
        /*
         * Write the locator block to this replica only, with a
         * commit count of 0, then restore the in-core commit count.
         */
        commitcnt = lbp->lb_commitcnt;
        lbp->lb_commitcnt = 0;
        setidentifier(s, &lbp->lb_ident);
        crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
        /*
         * Don't need to write out device id area, since locator
         * block on this replica is being deleted by setting the
         * commitcnt to 0.
         */
        (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
            MDDB_WR_ONLY_MASTER);
        lbp->lb_commitcnt = commitcnt;
    }

    if (s->s_mbiarray[li]) {
        /* A freed mbi pointer still exists in the mddb_ri_t */
        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (rip->ri_mbip == s->s_mbiarray[li])
                rip->ri_mbip = NULL;
        }
        free_mbipp(&s->s_mbiarray[li]);
    }

    if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
        dev = md_expldev(locators[li].l_dev);
        dev = md_xlate_targ_2_mini(dev);
        if (dev != NODEV64)
            mddb_devclose(dev);
    }

    s->s_mbiarray[li] = 0;
    lbp->lb_locators[li].l_flags = MDDB_F_DELETED;

    /* Only support data tags for traditional and local sets */
    if ((md_get_setstatus(setno) & MD_SET_STALE) &&
        (!(lbp->lb_flags & MDDB_MNSET)) &&
        setno != MD_LOCAL_SET)
        if (set_dtag(s, ep))
            mdclrerror(ep);

    /* Write data tags to all accessible devices */
    /* Only support data tags for traditional and local sets */
    if (!(lbp->lb_flags & MDDB_MNSET)) {
        (void) dt_write(s);
    }

    /* Delete device id of deleted replica */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        (void) mddb_devid_delete(s, li);
    }
    /* write new locator to all devices */
    err = writelocall(s);

    (void) upd_med(s, "getdeldev(0)");

    SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
        md_expldev(locators[li].l_dev));

    computefreeblks(s); /* recompute always it may be larger */
    cp->c_dbcnt--;
    err |= fixoptrecords(s);
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            /*
             * mdmddberror() is given MDE_* codes everywhere else
             * in this file section; use MDE_DB_NOTNOW rather
             * than the MDDB_E_NOTNOW status value.
             */
            return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
        }
    }

    single_thread_end(s);
    mddb_setexit(s);
    return (0);
}

/*
 * getdriver - store in clp->l_driver the driver name for the major number
 *      of clp->l_dev.  During an upgrade (MD_UPGRADE) the name is taken
 *      from md_targ_major_to_name(), otherwise from ddi_major_to_name().
 *  RETURN
 *      EINVAL  l_dev is not positive, or no driver name is known
 *      0   Success
 */
static int
getdriver(
    mddb_cfg_loc_t  *clp
)
{
    major_t     majordev;
    const char  *drvname;

    /*
     * Data checking
     */
    if (clp->l_dev <= 0)
        return (EINVAL);

    majordev = getmajor(expldev(clp->l_dev));

    if (ddi_major_to_name(majordev) == (char *)NULL)
        return (EINVAL);

    if (MD_UPGRADE)
        drvname = md_targ_major_to_name(majordev);
    else
        drvname = ddi_major_to_name(majordev);

    /*
     * The check above only validates ddi_major_to_name(); make sure the
     * name actually being copied is usable before handing it to strcpy.
     */
    if (drvname == NULL)
        return (EINVAL);

    (void) strcpy(clp->l_driver, drvname);
    return (0);
}

/*
 * update_valid_replica - updates the locator block namespace (prefix
 *  and/or suffix) with new pathname and devname.
 *
 *  Nothing is done (and 0 is returned) unless the locator's device
 *  matches devt.  A single trailing '/' is stripped from pathname in
 *  place.  The prefix table entry used by the locator's suffix is
 *  switched to one matching pathname, adding a new entry if necessary,
 *  and the suffix data is replaced with devname when it differs.
 *
 *  RETURN
 *      1   Error (name too long, or no free prefix slot)
 *      0   Success
 */
static int
update_valid_replica(
    side_t      side,
    mddb_locator_t  *lp,
    mddb_set_t  *s,
    int     li,
    char        *devname,
    char        *pathname,
    md_dev64_t  devt
)
{
    size_t      pathlen, devlen;
    uchar_t     pre_len, suf_len;
    md_name_suffix  *sn;
    mddb_ln_t   *lnp;
    uchar_t     pre_index;
    uchar_t     i;

    if (md_expldev(lp->l_dev) != devt) {
        return (0);
    }

    /*
     * Strip one trailing '/'.  The length is tested first so that an
     * empty pathname does not index before the start of the buffer.
     */
    pathlen = strlen(pathname);
    if (pathlen > 0 && pathname[pathlen - 1] == '/')
        pathname[--pathlen] = '\0';
    devlen = strlen(devname);

    /*
     * Range-check the full-width lengths before narrowing them to the
     * uchar_t length fields; checking after the narrowing would let an
     * over-long name wrap around and pass.
     * NOTE(review): assumes MD_MAXPREFIX and MD_MAXSUFFIX fit in a
     * uchar_t - confirm against their definitions.
     */
    if ((pathlen > MD_MAXPREFIX) || (devlen > MD_MAXSUFFIX))
        return (1);

    pre_len = (uchar_t)pathlen;
    suf_len = (uchar_t)devlen;

    lnp = s->s_lnp;

    /*
     * Future note:  Need to do something here for the MN diskset case
     * when device ids are supported in disksets.
     * Can't add until merging devids_in_diskset code into code base
     * Currently only called with side of 0.
     */

    sn = &lnp->ln_suffixes[side][li];

    /*
     * Check if prefix (Ex: /dev/dsk) needs to be changed.
     * If new prefix is the same as the previous prefix - no change.
     *
     * If new prefix is not the same, check if new prefix
     * matches an existing one.  If so, use that one.
     *
     * If new prefix doesn't exist, add a new prefix.  If not enough
     * space, return failure.
     */
    pre_index = sn->suf_prefix;
    /* Check if new prefix is the same as the old prefix. */
    if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
        (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
        pre_len) != 0)) {
        /* Check if new prefix is an already known prefix. */
        for (i = 0; i < MDDB_PREFIXCNT; i++) {
            if (lnp->ln_prefixes[i].pre_len != pre_len) {
                continue;
            }
            if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
                pre_len) == 0) {
                break;
            }
        }
        /* If no match found for new prefix - add the new prefix */
        if (i == MDDB_PREFIXCNT) {
            /* A prefix slot with length 0 is treated as free. */
            for (i = 0; i < MDDB_PREFIXCNT; i++) {
                if (lnp->ln_prefixes[i].pre_len == 0)
                    break;
            }
            /* No space to add new prefix - return failure */
            if (i == MDDB_PREFIXCNT) {
                return (1);
            }
            bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
            lnp->ln_prefixes[i].pre_len = pre_len;
        }
        sn->suf_prefix = i;
    }

    /* Now, update the suffix (Ex: c0t0d0s0) if needed */
    if ((sn->suf_len != suf_len) ||
        (bcmp(sn->suf_data, devname, suf_len) != 0)) {
        bcopy(devname, sn->suf_data, suf_len);
        sn->suf_len = suf_len;
    }
    return (0);
}


/*
 * md_update_locator_namespace - refresh the stored pathname/devname of
 *      replicas in set setno and write the locator names block back out.
 *
 *      Only when the locator block is MDDB_DEVID_STYLE are replicas
 *      examined; a replica is handed to update_valid_replica() when it
 *      is not deleted, is active, and its device id is flagged as both
 *      existing and valid.  A failure there abandons the update before
 *      anything is written.  Otherwise the locator names block is
 *      re-stamped, re-checksummed and written with writeall().
 *  RETURN
 *      1   Error
 *      0   Success
 */
int
md_update_locator_namespace(
    set_t       setno,      /* which set to get name from */
    side_t      side,
    char        *dname,
    char        *pname,
    md_dev64_t  devt
)
{
    mddb_set_t  *s;
    mddb_lb_t   *lbp;
    mddb_ln_t   *lnp;
    mddb_locator_t  *locp;
    uint_t      didflags;
    int     idx;
    int     rval = 0;

    s = mddb_setenter(setno, MDDB_MUSTEXIST, &rval);
    if (s == NULL)
        return (1);

    single_thread_start(s);
    lbp = s->s_lbp;

    /* Per-replica updates apply to device-id style locator blocks only. */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        for (idx = 0; idx < lbp->lb_loccnt; idx++) {
            locp = &lbp->lb_locators[idx];

            /* skip deleted slots */
            if (locp->l_flags & MDDB_F_DELETED)
                continue;

            /* skip replicas that are not active */
            if (!(locp->l_flags & MDDB_F_ACTIVE))
                continue;

            /* the device id must both exist and be valid */
            didflags = s->s_did_icp->did_ic_blkp->
                blk_info[idx].info_flags;
            if (!(didflags & MDDB_DID_EXISTS) ||
                !(didflags & MDDB_DID_VALID))
                continue;

            if (update_valid_replica(side, locp, s, idx, dname,
                pname, devt)) {
                /* give up without writing anything */
                single_thread_end(s);
                mddb_setexit(s);
                return (1);
            }
        }
    }

    /* Re-stamp the locator names and push them to the replicas. */
    lnp = s->s_lnp;
    uniqtime32(&lnp->ln_timestamp);
    lnp->ln_revision = (lbp->lb_flags & MDDB_MNSET) ?
        MDDB_REV_MNLN : MDDB_REV_LN;
    crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
    rval = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
        lbp->lb_lnblkcnt, 0);

    /*
     * On the master of a MN diskset, flag that the locator names have
     * changed by setting MDDB_PARSE_LOCNM in the set's parse flags.
     */
    if ((lbp->lb_flags & MDDB_MNSET) &&
        (md_set[s->s_setno].s_am_i_master)) {
        s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
    }

    single_thread_end(s);
    mddb_setexit(s);

    return (rval ? 1 : 0);
}

/*
 * update_locatorblock - replace a stale device id in the locator block.
 *
 *      Every locator that is not deleted is examined.  A locator is a
 *      candidate when its devt equals dev and it has a device id that
 *      exists but is not flagged valid.  If the stored device id differs
 *      from didptr, the stored one is deleted, didptr is added in its
 *      place (with the minor name looked up for dev), and the scan
 *      stops.  Afterwards the locator block is pushed out and the
 *      mediator is updated.
 *
 *      During import of replicated disksets, old_didptr holds the
 *      original disk's device id, and a candidate is only updated when
 *      its stored device id equals old_didptr.  This is the case being
 *      handled:
 *
 *      Original_disk   Replicated_disk Disk_Available_During_Import
 *      c1t1d0      c1t3d0      no - so old name c1t1d0 shown
 *      c1t2d0      c1t1d0      yes - name is c1t1d0
 *      c1t3d0      c1t2d0      yes - name is c1t2d0
 *
 *      Matching on devt alone is not enough because the first and third
 *      disks share a devt; the original device id tells them apart.
 *  RETURN
 *      MDDB_E_NODEVID
 *      MDDB_E_NOLOCBLK
 *      1   Error
 *      0   Success
 */
static int
update_locatorblock(
    mddb_set_t  *s,
    md_dev64_t  dev,
    ddi_devid_t didptr,
    ddi_devid_t old_didptr
)
{
    mddb_lb_t   *lbp = s->s_lbp;
    mddb_locator_t  *locp;
    ddi_devid_t cur_devid;
    uint_t      didflags;
    char        *mname;
    int     is_repl_import;
    int     idx;
    int     rval;

    /* Non-zero when the set is in the middle of a replicated import */
    is_repl_import = md_get_setstatus(s->s_setno) &
        MD_SET_REPLICATED_IMPORT;

    for (idx = 0; idx < lbp->lb_loccnt; idx++) {
        locp = &lbp->lb_locators[idx];

        /* deleted replicas are of no interest */
        if (locp->l_flags & MDDB_F_DELETED)
            continue;

        /* need a devt match and an associated device id */
        didflags = s->s_did_icp->did_ic_blkp->blk_info[idx].info_flags;
        if ((md_expldev(locp->l_dev) != dev) ||
            !(didflags & MDDB_DID_EXISTS))
            continue;

        /* a device id already marked valid is left alone */
        if (didflags & MDDB_DID_VALID)
            continue;

        cur_devid = s->s_did_icp->did_ic_devid[idx];
        if (cur_devid == NULL)
            return (MDDB_E_NODEVID);

        /*
         * For a replicated import only the entry whose current
         * device id is the original disk's may be updated.
         */
        if (is_repl_import &&
            ddi_devid_compare(cur_devid, old_didptr) != 0)
            continue;

        /* identical device id: nothing to replace */
        if (ddi_devid_compare(cur_devid, didptr) == 0)
            continue;

        /* device ids differ: delete the old one and add the new */
        if (ddi_lyr_get_minor_name(md_dev64_to_dev(dev), S_IFBLK,
            &mname) != DDI_SUCCESS)
            return (1);

        (void) mddb_devid_delete(s, idx);
        (void) mddb_devid_add(s, idx, didptr, mname);
        kmem_free(mname, strlen(mname)+1);
        break;
    }

    rval = push_lb(s);
    (void) upd_med(s, "update_locatorblock(0)");
    return (rval);
}

/*
 * update_mb_devid - rewrite the master block of the replica described by
 *      rip, embedding devidptr as its device id when one is supplied.
 *
 *      If rip has an in-core master block it is updated in place.  If it
 *      has none, a temporary "dummy" master block (MDDB_MAGIC_DU) is
 *      built so the device id can still be recorded on disk; without a
 *      device id there is nothing to write in that case.
 *  RETURN
 *      0 when nothing had to be written, otherwise the putblks() result
 */
static int
update_mb_devid(
    mddb_set_t  *s,
    mddb_ri_t   *rip,
    ddi_devid_t devidptr
)
{
    mddb_mb_t   *mbp;
    daddr_t     mb_blkno;
    md_dev64_t  mb_dev;
    uint_t      devid_sz;
    int     is_scratch;     /* mbp was allocated here */
    int     rval;

    if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
        /* use the master block already held in core */
        mbp = &rip->ri_mbip->mbi_mddb_mb;
        is_scratch = 0;
    } else if (devidptr == (ddi_devid_t)NULL) {
        /* no master block and no device id to record: done */
        return (0);
    } else {
        /* build a dummy master block just to carry the device id */
        mbp = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
        mbp->mb_magic = MDDB_MAGIC_DU;
        mbp->mb_revision = MDDB_REV_MB;
        is_scratch = 1;
    }

    mb_blkno = rip->ri_blkno;
    mb_dev = rip->ri_dev;

    if (devidptr != (ddi_devid_t)NULL) {
        /* wipe any previous device id before storing the new one */
        if (mbp->mb_devid_len)
            bzero(mbp->mb_devid, mbp->mb_devid_len);
        devid_sz = ddi_devid_sizeof(devidptr);
        bcopy((char *)devidptr, (char *)mbp->mb_devid, devid_sz);
        mbp->mb_devid_len = devid_sz;
    }

    mbp->mb_setno = s->s_setno;
    uniqtime32(&mbp->mb_timestamp);
    crcgen(mbp, &mbp->mb_checksum, MDDB_BSIZE, NULL);

    /*
     * Only a single block is written.  As noted by the original
     * author, putblks drops the s_dbmx lock, does a biowait and then
     * regains the lock, and this would need revisiting to handle a
     * chained master block (mb_next != NULL), which is considered
     * unlikely.
     */
    rval = putblks(s, (caddr_t)mbp, mb_blkno, 1, mb_dev, 0);

    if (is_scratch)
        kmem_free(mbp, MDDB_BSIZE);

    return (rval);
}

/*
 * setdid - record the current device id of cp->c_devt in the set's
 *      replica master blocks and in the locator block.
 *
 *      Nothing is done for a stale set.  The device id is fetched with
 *      ddi_lyr_get_devid() and is always released with ddi_devid_free()
 *      before returning, on both success and failure paths.
 *  RETURN
 *      EINVAL  bad set number or device
 *      -1  device id unavailable, set could not be entered, or
 *          an update failed
 *      0   Success (or stale set, nothing done)
 */
static int
setdid(
    mddb_config_t       *cp
)
{
    ddi_devid_t     devidp;
    dev_t           ddi_dev;
    mddb_set_t      *s;
    int         err = 0;
    mddb_ri_t       *rip;

    /*
     * Data integrity check
     */
    if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
        return (EINVAL);

    if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
        return (0);

    ddi_dev = md_dev64_to_dev(cp->c_devt);
    if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
        return (-1);
    }
    if (devidp == NULL) {
        return (-1);
    }

    if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
        /* devidp is owned by this function; don't leak it. */
        ddi_devid_free(devidp);
        return (-1);
    }
    single_thread_start(s);

    for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
        /* replicas without a locator block are skipped */
        if (rip->ri_lbp == (mddb_lb_t *)NULL)
            continue;
        /*
         * We only update what is asked
         */
        if (rip->ri_dev == cp->c_devt) {
            if (update_mb_devid(s, rip, devidp) != 0) {
                err = -1;
                goto out;
            }
        }
    }

    if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
        err = -1;
        goto out;
    }

out:
    single_thread_end(s);
    mddb_setexit(s);
    ddi_devid_free(devidp);
    return (err);
}

/*
 * delnewside - add (MDDB_NEWSIDE) or remove (MDDB_DELSIDE) the per-side
 *      replica information for side cp->c_sideno in set cp->c_setno.
 *
 *      For either command the replica described by cp->c_locator must
 *      be found among the non-deleted locators; it is matched by device
 *      id, minor name and block number when the locator block is
 *      MDDB_DEVID_STYLE and a device id could be obtained, otherwise by
 *      device number and block number.
 *
 *      MDDB_NEWSIDE stores the locator name and then the side locator
 *      for that one replica.  MDDB_DELSIDE clears the side's name and
 *      side locator entries for every locator in the block.  The
 *      locator names and then the locator block are written to all
 *      replicas afterwards.
 *
 *      Not permitted while MD_UPGRADE is set.
 *  RETURN
 *      0       Success
 *      non-zero    Error; ep is filled in by mdmderror(),
 *              mdmddberror() or mddbstatus2error()
 */
static int
delnewside(
    mddb_config_t       *cp,
    int         command,
    md_error_t      *ep
)
{
    mddb_set_t      *s;
    int         li;
    mddb_lb_t       *lbp;       /* pointer to locator block */
    mddb_ln_t       *lnp;       /* pointer to locator names */
    mddb_mnln_t     *mnlnp;     /* pointer to locator names */
    mddb_locator_t      *lp;
    mddb_sidelocator_t  *slp;
    mddb_cfg_loc_t      *clp;
    int         err = 0;
    set_t           setno = cp->c_setno;
    ddi_devid_t     devid;
    ddi_devid_t     ret_devid = NULL;
    char            *minor_name;
    uint_t          use_devid = 0;
    dev_t           ddi_dev;
    md_mnname_suffix_t  *mnsn;
    mddb_mnlb_t     *mnlbp;
    mddb_mnsidelocator_t    *mnslp;

    /* Currently don't allow addition/deletion of sides during upgrade */
    if (MD_UPGRADE) {
        cmn_err(CE_WARN,
            "Addition and deletion of sides not allowed"
            " during upgrade. \n");
        return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
    }

    /*
     * Data integrity check
     */
    if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
        return (mddbstatus2error(ep, err, NODEV32, setno));

    single_thread_start(s);
    clp = &cp->c_locator;

    lbp = s->s_lbp;

    /* The in-core locator block must belong to the requested set. */
    if (lbp->lb_setno != setno) {
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
    }

    /*
     * Find this device/blkno pair
     */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        /*
         * Try to get the device id and minor name of the device.
         * use_devid is set only when both were obtained and the
         * minor name fits; from then on clp->l_devid carries the
         * device id, which must be freed on every exit path.
         */
        ddi_dev = md_dev64_to_dev(clp->l_dev);
        if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
            (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
            == DDI_SUCCESS)) {
            if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
                clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
                use_devid = 1;
                (void) strcpy(clp->l_minor_name, minor_name);
            }
            kmem_free(minor_name, strlen(minor_name)+1);
        }
        /* Device id obtained but not going to be used: release it. */
        if (use_devid != 1 && ret_devid != NULL)
            ddi_devid_free(ret_devid);
    }
    /* Search the non-deleted locators for the requested replica. */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        if (use_devid) {
            /* A return of 0 from mddb_devid_get skips this locator. */
            if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
                continue;
            if ((ddi_devid_compare(devid,
                (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
                (strcmp(clp->l_minor_name, minor_name) == 0) &&
                ((daddr_t)lp->l_blkno == clp->l_blkno)) {
                break;
            }
        } else {
            if (lp->l_dev == clp->l_dev &&
                (daddr_t)lp->l_blkno == clp->l_blkno) {
                break;
            }
        }
    }

    /* Ran off the end of the locators: the replica was not found. */
    if (li == lbp->lb_loccnt) {
        if (use_devid)
            ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
    }

    lnp = s->s_lnp;
    if (command == MDDB_NEWSIDE) {
        int     index = 0;
        /*
         * If a MN diskset, need to find the index where the new
         * locator information is to be stored in the mnsidelocator
         * field of the locator block so that the locator name can
         * be stored at the same array index in the mnsuffixes
         * field of the locator names structure.
         */
        if (lbp->lb_flags & MDDB_MNSET) {
            if ((index = checklocator(lbp, li,
                cp->c_sideno)) == -1) {
                if (use_devid) {
                    ddi_devid_free((ddi_devid_t)
                        (uintptr_t)clp->l_devid);
                }
                single_thread_end(s);
                mddb_setexit(s);
                return (mdmddberror(ep, MDE_DB_TOOSMALL,
                    NODEV32, setno));
            }
        }

        /*
         * Store the locator name before the sidelocator information
         * in case a panic occurs between these 2 steps.  Must have
         * the locator name information in order to print reasonable
         * error information.
         */
        if (splitname2locatorblock(&cp->c_devname, lnp, li,
            cp->c_sideno, index)) {
            if (use_devid)
                ddi_devid_free(
                    (ddi_devid_t)(uintptr_t)clp->l_devid);
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
                setno));
        }

        if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
            if (use_devid)
                ddi_devid_free(
                    (ddi_devid_t)(uintptr_t)clp->l_devid);
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
                setno));
        }
    }

    /* The device id is not referenced past this point. */
    if (use_devid)
        ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);

    if (command == MDDB_DELSIDE) {
        int i;
        /* Clear this side's entries for every locator in the block. */
        for (i = 0; i < lbp->lb_loccnt; i++) {
            if (lbp->lb_flags & MDDB_MNSET) {
                int j;
                mnlbp = (mddb_mnlb_t *)lbp;
                /* Find the slot holding this side, if any. */
                for (j = 0; j < MD_MNMAXSIDES; j++) {
                    mnslp = &mnlbp->lb_mnsidelocators[j][i];
                    if (mnslp->mnl_sideno == cp->c_sideno)
                        break;
                }
                if (j < MD_MNMAXSIDES) {
                    /*
                     * Reset the side locator and zero the
                     * name suffix at the same slot.
                     */
                    mnslp->mnl_mnum = NODEV32;
                    mnslp->mnl_sideno = 0;
                    mnlnp = (mddb_mnln_t *)lnp;
                    mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
                    bzero((caddr_t)mnsn,
                        sizeof (md_mnname_suffix_t));
                }
            } else {
                /* The side number is used directly as the index. */
                slp = &lbp->lb_sidelocators[cp->c_sideno][i];
                bzero((caddr_t)&lnp->ln_suffixes
                    [cp->c_sideno][i], sizeof (md_name_suffix));
                slp->l_mnum = NODEV32;
            }
        }
    }

    /* write new locator names to all devices */
    uniqtime32(&lnp->ln_timestamp);
    if (lbp->lb_flags & MDDB_MNSET)
        lnp->ln_revision = MDDB_REV_MNLN;
    else
        lnp->ln_revision = MDDB_REV_LN;
    crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
    err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
        lbp->lb_lnblkcnt, 0);
    /*
     * If a MN diskset and this is the master, set the PARSE_LOCNM
     * flag in the mddb_set structure to show that the locator
     * names have changed.
     */

    if ((lbp->lb_flags & MDDB_MNSET) &&
        (md_set[s->s_setno].s_am_i_master)) {
        s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
    }
    /* A failed write gets one retry pass before giving up. */
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
        }
    }

    uniqtime32(&lbp->lb_timestamp);
    /* write new locator to all devices */
    err = writelocall(s);

    (void) upd_med(s, "delnewside(0)");

    computefreeblks(s); /* recompute always it may be larger */
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
        }
    }

    single_thread_end(s);
    mddb_setexit(s);

    return (0);
}

static int
newdev(
    mddb_config_t   *cp,
    int     command,
    md_error_t  *ep
)
{
    mddb_set_t  *s;
    mddb_mb_ic_t    *mbip, *mbip1;
    int     i, j;
    int     li;
    mddb_lb_t   *lbp;       /* pointer to locator block */
    mddb_ln_t   *lnp;       /* pointer to locator names */
    mddb_locator_t  *lp;
    mddb_cfg_loc_t  *clp;
    int     err = 0;
    set_t       setno = cp->c_setno;
    ddi_devid_t devid2;
    ddi_devid_t ret_devid = NULL;
    char        *minor_name;
    uint_t      use_devid = 0;
    dev_t       ddi_dev;
    int     old_flags;
    int     flags;
    int     mn_set = 0;
    int     index;
    mddb_ri_t   *rip;
    int     locator_deleted = 0;
    dev32_t     locator_deleted_dev;
    int     sz = 0;


    /* Currently don't allow addition of new replica during upgrade */
    if (MD_UPGRADE) {
        cmn_err(CE_WARN,
            "Addition of new replica not allowed during upgrade.\n");
        return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
    }

    /*
     * Data integrity check
     */
    if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    /* Determine the flag settings for multinode sets */
    flags = MDDB_NOOLDOK;
    if (cp->c_multi_node)
        flags |= MDDB_MULTINODE;

    if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
        if (err != MDDB_E_NOTOWNER)
            return (mddbstatus2error(ep, err, NODEV32, setno));
        s = init_set(cp, flags, &err);
        if (s == NULL)
            return (mddbstatus2error(ep, err, NODEV32, setno));
    }

    single_thread_start(s);

    /* shorthand */
    clp = &cp->c_locator;

    /* shorthand */
    lbp = s->s_lbp;

    if (lbp->lb_setno != setno) {
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
    }

    /*
     * See if this device/blkno pair is already a replica
     */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        ddi_dev = expldev(clp->l_dev);
        if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
            (ddi_lyr_get_minor_name(ddi_dev,
            S_IFBLK, &minor_name) == DDI_SUCCESS)) {
            if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
                clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
                use_devid = 1;
                (void) strcpy(clp->l_minor_name, minor_name);
            }
            kmem_free(minor_name, strlen(minor_name)+1);
        }
        if (use_devid != 1 && ret_devid != NULL)
            ddi_devid_free(ret_devid);
    }

    for (i = 0; i < lbp->lb_loccnt;  i++) {
        lp = &lbp->lb_locators[i];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        if (use_devid) {
            if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
                continue;
            if ((ddi_devid_compare(devid2,
                (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
                (strcmp(clp->l_minor_name, minor_name) == 0) &&
                ((daddr_t)lp->l_blkno == clp->l_blkno)) {
                if (command == MDDB_NEWDEV) {
                    ddi_devid_free((ddi_devid_t)(uintptr_t)
                        clp->l_devid);
                    single_thread_end(s);
                    mddb_setexit(s);
                    return (mdmddberror(ep,
                        MDE_DB_EXISTS, NODEV32, setno));
                }
            }
        } else {
            if (lp->l_dev == clp->l_dev &&
                (daddr_t)lp->l_blkno == clp->l_blkno) {
                if (command == MDDB_NEWDEV) {
                    single_thread_end(s);
                    mddb_setexit(s);
                    return (mdmddberror(ep,
                        MDE_DB_EXISTS, NODEV32, setno));
                }
            }
        }
    }

    /*
     * Really is a new replica, go get the master blocks
     */
    mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
        (uint_t *)0, &mn_set);
    if (! mbip) {
        if (use_devid)
            ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
    }

    /*
     * Compute free blocks in replica.
     */
    computefreeblks(s);

    /*
     * Check if this is large enough
     */
    for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
        i += mbip1->mbi_mddb_mb.mb_blkcnt;
    for (j = i; j < s->s_totalblkcnt; j++) {
        if (blkcheck(s, j)) {
            while (mbip) {
                mbip1 = mbip->mbi_next;
                kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
                mbip = mbip1;
            }
            if (use_devid)
                ddi_devid_free(
                    (ddi_devid_t)(uintptr_t)clp->l_devid);
            mddb_devclose(md_expldev(clp->l_dev));
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
                setno));
        }
    }

    /* Look for a deleted slot */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED) {
            locator_deleted = 1;
            locator_deleted_dev = lp->l_dev;
            break;
        }
    }

    /* If no deleted slots, add a new one */
    if (li == lbp->lb_loccnt) {
        /* Already have the max replicas, bail */
        if (lbp->lb_loccnt == MDDB_NLB) {
            if (use_devid)
                ddi_devid_free((ddi_devid_t)(uintptr_t)
                    clp->l_devid);
            mddb_devclose(md_expldev(clp->l_dev));
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
                setno));
        }
        lbp->lb_loccnt++;
        lp = &lbp->lb_locators[li];
    }

    /* Initialize the new or deleted slot */
    old_flags = lp->l_flags;
    lp->l_dev = clp->l_dev;
    lp->l_blkno = (daddr32_t)clp->l_blkno;
    lp->l_flags = clp->l_flags;

    /* shorthand */
    lnp = s->s_lnp;

    index = 0;
    if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
        /*
         * If a MN diskset, need to find the index where the new
         * locator information is to be stored in the mnsidelocator
         * field of the locator block so that the locator name can
         * be stored at the same array index in the mnsuffixes
         * field of the locator names structure.
         */
        lbp->lb_flags |= MDDB_MNSET;
        if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
            if (use_devid)
                ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
                    l_devid);
            lp->l_flags = old_flags;
            lbp->lb_loccnt--;
            mddb_devclose(md_expldev(clp->l_dev));
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_TOOSMALL,
                NODEV32, setno));
        }
    }
    /*
     * Store the locator name before the sidelocator information
     * in case a panic occurs between these 2 steps.  Must have
     * the locator name information in order to print reasonable
     * error information.
     */
    if (splitname2locatorblock(&cp->c_devname, lnp, li,
        s->s_sideno, index)) {
        if (use_devid)
            ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
        lp->l_flags = old_flags;
        lbp->lb_loccnt--;
        mddb_devclose(md_expldev(clp->l_dev));
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
    }

    /*
     * Compute free blocks in replica before calling cfgloc2locator
     * since cfgloc2locator may attempt to alloc an unused block
     * to store the device id.
     * mbiarray needs to be setup before calling computefreeblks.
     */
    s->s_mbiarray[li] = mbip;
    computefreeblks(s);

    if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
        if (use_devid)
            ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
        lp->l_flags = old_flags;
        lbp->lb_loccnt--;
        s->s_mbiarray[li] = 0;
        mddb_devclose(md_expldev(clp->l_dev));
        single_thread_end(s);
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
    }

    /*
     * Hijack a deleted rip master record and correct the contents
     */
    if (locator_deleted) {
        for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
            if (rip->ri_lbp != NULL &&
                rip->ri_mbip == 0 &&
                (rip->ri_dev == md_expldev(locator_deleted_dev))) {
                rip->ri_dev = md_expldev(clp->l_dev);
                rip->ri_mbip = mbip;

                if (use_devid && clp->l_devid != 0) {
                    sz = (int)ddi_devid_sizeof(
                        (ddi_devid_t)(uintptr_t)
                        clp->l_devid);
                    rip->ri_devid =
                        (ddi_devid_t)kmem_zalloc(sz,
                        KM_SLEEP);
                    bcopy((void *)(uintptr_t)clp->l_devid,
                        (char *)rip->ri_devid, sz);
                }

                break;
            }
        }
    }

    if (use_devid)
        ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);

    uniqtime32(&lbp->lb_timestamp);
    lp->l_flags = MDDB_F_ACTIVE;

    /* write db copy to new device */
    err = writecopy(s, li, MDDB_WRITECOPY_ALL);
    lp->l_flags |= MDDB_F_UP2DATE;

    /* write new locator names to all devices */
    uniqtime32(&lnp->ln_timestamp);
    if (lbp->lb_flags & MDDB_MNSET)
        lnp->ln_revision = MDDB_REV_MNLN;
    else
        lnp->ln_revision = MDDB_REV_LN;
    crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
    err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
        lbp->lb_lnblkcnt, 0);
    /*
     * If a MN diskset and this is the master, set the PARSE_LOCNM
     * flag in the mddb_set structure to show that the locator
     * names have changed.
     */

    if ((lbp->lb_flags & MDDB_MNSET) &&
        (md_set[s->s_setno].s_am_i_master)) {
        s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
    }
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
        }
    }

    /* Data tags not supported on MN sets */
    if ((md_get_setstatus(setno) & MD_SET_STALE) &&
        (!(lbp->lb_flags & MDDB_MNSET)) &&
        setno != MD_LOCAL_SET)
        if (set_dtag(s, ep))
            mdclrerror(ep);

    /* Write data tags to all accessible devices */
    /* Data tags not supported on MN sets */
    if (!(lbp->lb_flags & MDDB_MNSET)) {
        (void) dt_write(s);
    }

    /* write new locator to all devices */
    err = writelocall(s);

    (void) upd_med(s, "newdev(0)");

    SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
        md_expldev(clp->l_dev));

    computefreeblks(s); /* recompute always it may be smaller */
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
        }
    }

    single_thread_end(s);
    mddb_setexit(s);

    return (0);
}

#ifdef DEBUG
/*
 * Debug-only consistency walk over the in-core database of one set.
 * Every directory entry must point at a record block carrying the
 * record-block magic, and any separate user-data pointer must look
 * like a plausible address (greater than 2000).  Sets without a
 * database attached are silently ignored.
 */
static void
mddb_check_set(
    set_t   setno
)
{
    mddb_set_t  *setp;
    mddb_db_t   *dirp;
    mddb_de_ic_t    *entp;

    setp = (mddb_set_t *)md_set[setno].s_db;
    if (setp == NULL)
        return;

    dirp = setp->s_dbp;
    while (dirp != NULL) {
        entp = dirp->db_firstentry;
        while (entp != NULL) {
            ASSERT(entp->de_rb->rb_magic == MDDB_MAGIC_RB);
            if (entp->de_rb_userdata)
                ASSERT((uintptr_t)entp->de_rb_userdata > 2000);
            entp = entp->de_next;
        }
        dirp = dirp->db_next;
    }
}
#endif /* DEBUG */

/*
 * Exported Entry Points
 */
#ifdef DEBUG
/*
 * Debug-only entry point: run mddb_check_set() over every set slot.
 *
 * A set slot without a database is skipped rather than ending the
 * walk, so that a gap in the set table does not hide the sets that
 * follow it from the consistency check.
 */
void
mddb_check(void)
{
    int i;

    for (i = 0; i < md_nsets; i++) {
        if (! md_set[i].s_db)
            continue;

        mddb_check_set(i);
    }

}
#endif /* DEBUG */

/*
 * Dispatch a replica configuration command.
 *
 * The error structure embedded in `cp' is cleared first; each command
 * is then handed to its worker.  The return value is whatever the
 * worker (or the error constructor) produced; 0 when nothing failed.
 */
int
mddb_configure(
    mddb_cfgcmd_t   command,
    mddb_config_t   *cp
)
{
    md_error_t  *ep = &cp->c_mde;
    set_t       setno = cp->c_setno;
    mddb_set_t  *s;
    int     ridev_flag;
    int     err = 0;

    mdclrerror(ep);

    switch (command) {
    case MDDB_NEWDEV:
        return (newdev(cp, command, ep));

    case MDDB_NEWSIDE:
    case MDDB_DELSIDE:
        return (delnewside(cp, command, ep));

    case MDDB_GETDEV:
    case MDDB_DELDEV:
    case MDDB_ENDDEV:
        return (getdeldev(cp, command, ep));

    case MDDB_GETDRVRNAME:
        return (getdriver(&cp->c_locator));

    case MDDB_USEDEV:
        /*
         * USEDEV has to keep working during upgrade so that
         * auto-take disksets are supported.  The one exception is
         * a set import while md_devid_destroy is set, which is
         * rejected outright.
         */
        if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
            return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

        if (setno >= md_nsets)
            return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

        /* Enter the set, creating it first when it is not there yet */
        s = mddb_setenter(setno, MDDB_NOINIT, &err);
        if (s == NULL) {
            s = init_set(cp, MDDB_NOINIT, &err);
            if (s == NULL)
                return (mddbstatus2error(ep, err, NODEV32,
                    setno));
        }

        ridev_flag = (setno == MD_LOCAL_SET) ? MDDB_F_IOCTL : 0;

        if (cp->c_locator.l_old_devid)
            md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);

        err = ridev(&s->s_rip, &cp->c_locator, NULL, ridev_flag);
        mddb_setexit(s);
        return (err);

    case MDDB_RELEASESET:
        mutex_enter(&mddb_lock);
        mddb_unload_set(cp->c_setno);
        mutex_exit(&mddb_lock);
        return (0);

    case MDDB_SETDID:
        return (setdid(cp));

    default:
        break;
    }

    /* Unknown command */
    return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno));
}

/*
 * Report the two optimized-record locator indexes of the record named
 * by ol->recid.
 *
 * ol->li[0] and ol->li[1] are preset to -1 and only overwritten when
 * the record is found.  Returns EINVAL for an out-of-range set number
 * and 0 otherwise (including when the set or the record is missing).
 */
int
mddb_getoptloc(
    mddb_optloc_t       *ol
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    mddb_recid_t        recid;
    set_t           setno;

    ol->li[0] = -1;
    ol->li[1] = -1;

    recid = ol->recid;
    setno = DBSET(recid);
    if (setno >= md_nsets)
        return (EINVAL);

    s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
    if (s == NULL)
        return (0);

    recid = DBID(recid);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        /* scan this directory block for the wanted record id */
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != recid)
            entp = entp->de_next;

        if (entp != NULL) {
            ol->li[0] = entp->de_optinfo[0].o_li;
            ol->li[1] = entp->de_optinfo[1].o_li;
            break;
        }
    }

    mddb_setexit(s);
    return (0);
}

/*
 * One-time initialization: set up mddb_lock, then call init_set()
 * with no configuration.  When init_set() hands back a set, it is
 * released again straight away via mddb_setexit().
 */
void
mddb_init(void)
{
    mddb_set_t  *setp;

    mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);

    setp = init_set(NULL, MDDB_NOINIT, NULL);
    if (setp == NULL)
        return;

    mddb_setexit(setp);
}


/*
 * Tear down every set: with mddb_lock held, clear MD_SET_KEEPTAG on
 * each set slot and unload it, then release the CRC table.
 */
void
mddb_unload(void)
{
    int setno = 0;

    mutex_enter(&mddb_lock);

    while (setno < md_nsets) {
        md_clr_setstatus(setno, MD_SET_KEEPTAG);
        mddb_unload_set(setno);
        setno++;
    }

    crcfreetab();

    mutex_exit(&mddb_lock);
}

/*
 * Create a new record in the database of set `setno'.
 *
 * The record is given the lowest record id not yet present in any
 * directory block, a directory entry is built for it (adding a new
 * directory block when no existing one has room), blocks are taken
 * from the free pool, and the record plus the affected directory
 * block(s) are written out.
 *
 * Returns the full record id, MAKERECID(setno, newid), on success.
 * Failure returns seen in this function:
 *   MDDB_E_INVALID  neither MD_CRO_32BIT nor MD_CRO_64BIT was given,
 *                   or the record needs more blocks than one
 *                   directory entry may describe
 *   MDDB_E_NOTNOW   checkstate() refused, the id space is exhausted,
 *                   or the writes failed and writeretry() failed too
 *   MDDB_E_NOSPACE  not enough free blocks
 *   the error stored by mddb_setenter() when the set cannot be entered
 */
mddb_recid_t
mddb_createrec(
    size_t      usersize,    /* size of db record */
    mddb_type_t type,        /* type1 of db record */
    uint_t      type2,       /* type2 of db record */
    md_create_rec_option_t  options, /* options for this creation  */
    set_t       setno        /* set number to create record in */
)
{
    mddb_set_t  *s;
    mddb_db_t   *dbp, *prevdbp, *newdbp;
    mddb_db32_t *db32p;
    mddb_de_ic_t    *dep;
    /* LINTED variable unused - used for sizeof calculations */
    mddb_de32_t *de32p;
    mddb_rb32_t *rbp;
    size_t      recsize;
    ulong_t     blkcnt;
    ulong_t     maxblocks;
    size_t      desize, desize_ic;
    size_t      used;
    mddb_recid_t    newid;
    caddr_t     tmppnt;
    int     i, err = 0;
    void        *userdata;
    uint_t      flag_type;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    /*
     * Every caller is supposed to specify whether this is a
     * 32 bit or a 64 bit record.
     */
    if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
        return (MDDB_E_INVALID);
    }

    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
        return (err);

    if (checkstate(s, MDDB_PROBE)) {
        mddb_setexit(s);
        return (MDDB_E_NOTNOW);
    }

    /*
     * Record size is the record block header (everything but the
     * rb_data member) plus the user data, rounded up to whole blocks.
     */
    recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
        usersize, MDDB_BSIZE);
    blkcnt = btodb(recsize);

    /*
     * Upper bound on blocks per record: the mddb_maxblocks tunable
     * when set, otherwise the number of block addresses that fit in
     * a directory block holding a single directory entry.
     */
    if (mddb_maxblocks)
        maxblocks = mddb_maxblocks;
    else
        maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
            sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);

    if (blkcnt > maxblocks) {
        mddb_setexit(s);
        return (MDDB_E_INVALID);
    }
    /*
     * Allocate the record block and a new directory block up front
     * so as to avoid sleeping after starting single_thread.
     * userdata is only allocated (and only read later) for
     * non-optimized records.
     */
    rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
    if ((options & MD_CRO_OPTIMIZE) == 0)
        userdata = kmem_zalloc(usersize, KM_SLEEP);
    newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);

    /*
     * If this is the largest record so far, allocate a new buffer
     * for checkcopy().
     */
    if (recsize > s->s_databuffer_size) {
        tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
        /*
         * Re-test the size in case we slept in the allocation and
         * some other task bumped the maximum record size meanwhile.
         */
        if (recsize > s->s_databuffer_size) {
            if (s->s_databuffer_size)
                kmem_free(s->s_databuffer,
                    s->s_databuffer_size);
            s->s_databuffer = tmppnt;
            s->s_databuffer_size = recsize;
        } else {
            kmem_free(tmppnt, recsize);
        }
    }

    single_thread_start(s);

    /*
     * Pick the lowest record id that no directory entry uses yet.
     * The do-loop repeats while the candidate id was found (dbp is
     * non-NULL only when the search broke out on a match).  Running
     * out of id space (DBID() wraps to 0) fails the create.
     */
    newid = 0;
    do {
        newid++;
        if (DBID(newid) == 0) {
            kmem_free((caddr_t)newdbp, sizeof (*newdbp));
            kmem_free((caddr_t)rbp, ((size_t)recsize));
            if ((options & MD_CRO_OPTIMIZE) == 0)
                kmem_free(userdata, usersize);
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NOTNOW);
        }

        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
            for (dep = dbp->db_firstentry; dep;
                dep = dep->de_next) {
                if (dep->de_recid == newid)
                    break;
            }
            if (dep != NULL)
                break;
        }
    } while (dbp);

    /* size of the new directory entry in its 32-bit (mddb_de32_t) form */
    desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
        (sizeof (mddb_block_t) * blkcnt);

    /*
     * see if a directory block exists which will hold this entry
     */
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        used = sizeof (*db32p);
        for (dep = dbp->db_firstentry;
            dep != NULL; dep = dep->de_next) {
            used += sizeof (*de32p) - sizeof (de32p->de32_blks);
            used += sizeof (mddb_block_t) * dep->de_blkcount;
        }
        if ((used + desize) < MDDB_BSIZE)
            break;
    }
    if (dbp) {
        /* an existing directory block has room; spare one not needed */
        kmem_free((caddr_t)newdbp, sizeof (*newdbp));
        if (blkcnt > s->s_freeblkcnt) {
            kmem_free((caddr_t)rbp, ((size_t)recsize));
            if ((options & MD_CRO_OPTIMIZE) == 0)
                kmem_free(userdata, usersize);
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NOSPACE);
        }
        prevdbp = NULL;
    } else {
        /*
         * need to add directory block
         * (one extra free block is required for it)
         */
        if ((blkcnt + 1) > s->s_freeblkcnt) {
            kmem_free((caddr_t)newdbp, sizeof (*newdbp));
            kmem_free((caddr_t)rbp, ((size_t)recsize));
            if ((options & MD_CRO_OPTIMIZE) == 0)
                kmem_free(userdata, usersize);
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NOSPACE);
        }
        /*
         * Append the new directory block after the last existing one.
         * NOTE(review): this walk dereferences s->s_dbp without a
         * NULL check, so it relies on at least one directory block
         * existing - confirm that invariant.
         */
        for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
            ;
        dbp->db_next = newdbp;
        bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
        dbp->db_nextblk = getfreeblks(s, 1);
        dbp->db_next->db_blknum = dbp->db_nextblk;
        /* remember the old tail; it must be rewritten with its new link */
        prevdbp = dbp;
        dbp = dbp->db_next;
        dbp->db_nextblk = 0;
        dbp->db_firstentry = NULL;
        dbp->db_recsum = 0;
        dbp->db_magic = MDDB_MAGIC_DB;
    }
    /*
     * ready to add record
     *
     * NOTE(review): the in-core directory entry is allocated with
     * KM_SLEEP here, after single_thread_start(), although the
     * pre-allocation above was done to avoid sleeping at this point.
     */
    desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
        (sizeof (mddb_block_t) * blkcnt);
    if (dbp->db_firstentry) {
        for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
            ;
        dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
        dep = dep->de_next;
    } else {
        dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
        dbp->db_firstentry = dep;
    }
    bzero((caddr_t)dep, desize_ic);
    dep->de_recid = newid;
    /*
     * Optimized records have an owner node associated with them in
     * a MN diskset.  The owner is only set on a node that is actively
     * writing to that record.  The other nodes will show that record
     * as having an invalid owner.  The owner for an optimized record
     * is used during fixoptrecord to determine which node should
     * write out the record when the replicas associated with that
     * optimized record have been changed.
     */
    if (MD_MNSET_SETNO(s->s_setno)) {
        dep->de_owner_nodeid = MD_MN_INVALID_NID;
    }
    dep->de_type1 = type;
    dep->de_type2 = type2;
    dep->de_reqsize = usersize;
    dep->de_recsize = recsize;
    dep->de_blkcount = blkcnt;
    /*
     * Translate the record-kind creation option into the matching
     * directory entry flag.  Only an exact single-kind match sets
     * de_flags; any other combination leaves it at zero.
     */
    flag_type = options &
        (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
        MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
        MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
    switch (flag_type) {
    case MD_CRO_OPTIMIZE:
        dep->de_flags = MDDB_F_OPT;
        getoptdev(s, dep, 0);
        getoptdev(s, dep, 1);
        break;
    case MD_CRO_STRIPE:
        dep->de_flags = MDDB_F_STRIPE;
        break;
    case MD_CRO_MIRROR:
        dep->de_flags = MDDB_F_MIRROR;
        break;
    case MD_CRO_RAID:
        dep->de_flags = MDDB_F_RAID;
        break;
    case MD_CRO_SOFTPART:
        dep->de_flags = MDDB_F_SOFTPART;
        break;
    case MD_CRO_TRANS_MASTER:
        dep->de_flags = MDDB_F_TRANS_MASTER;
        break;
    case MD_CRO_TRANS_LOG:
        dep->de_flags = MDDB_F_TRANS_LOG;
        break;
    case MD_CRO_HOTSPARE:
        dep->de_flags = MDDB_F_HOTSPARE;
        break;
    case MD_CRO_HOTSPARE_POOL:
        dep->de_flags = MDDB_F_HOTSPARE_POOL;
        break;
    case MD_CRO_CHANGELOG:
        dep->de_flags = MDDB_F_CHANGELOG;
        break;
    }
    /*
     * try to get all blocks consecutive. If not possible
     * just get them one at a time
     */
    dep->de_blks[0] = getfreeblks(s, blkcnt);
    if (dep->de_blks[0]) {
        for (i = 1; i < blkcnt; i++)
            dep->de_blks[i] = dep->de_blks[0] + i;
    } else {
        for (i = 0; i < blkcnt;  i++)
            dep->de_blks[i] = getfreeblks(s, 1);
    }
    dep->de_rb = rbp;
    bzero((caddr_t)rbp, recsize);
    rbp->rb_magic = MDDB_MAGIC_RB;

    /* Do we have to create an old style (32 bit) record?  */
    if (options & MD_CRO_32BIT) {
        if (options & MD_CRO_FN)
            rbp->rb_revision = MDDB_REV_RBFN;
        else
            rbp->rb_revision = MDDB_REV_RB;
    } else {
        if (options & MD_CRO_FN)
            rbp->rb_revision = MDDB_REV_RB64FN;
        else
            rbp->rb_revision = MDDB_REV_RB64;
    }

    /* set de_rb_userdata for non optimization records */
    if ((options & MD_CRO_OPTIMIZE) == 0) {
        dep->de_rb_userdata = userdata;
    }

    uniqtime32(&rbp->rb_timestamp);
    /* Generate the crc for this record */
    rec_crcgen(s, dep, rbp);
    tmppnt = (caddr_t)rbp;
    /*
     * the following code writes new records to all instances of
     * the data base. Writing one block at a time to each instance
     * is safe because they are not yet in a directory entry which
     * has been written to the data base
     */
    err = 0;
    if ((options & MD_CRO_OPTIMIZE) == 0) {
        for (i = 0; i < blkcnt;  i++) {
            err |= writeall(s, (caddr_t)tmppnt,
                dep->de_blks[i], 1, 0);
            tmppnt += MDDB_BSIZE;
        }
    } else {
        if ((MD_MNSET_SETNO(s->s_setno)) &&
            md_set[s->s_setno].s_am_i_master) {
        /*
         * If a MN diskset then only master writes out newly
         * created optimized record.
         */
            err |= writeoptrecord(s, dep);
        }
    }
    /*
     * Now write the directory block that describes the new record,
     * converted to its 32-bit on-disk form and checksummed.
     */
    uniqtime32(&dbp->db_timestamp);
    dbp->db_revision = MDDB_REV_DB;
    /* Don't include opt resync and change log records in global XOR */
    if (!(dep->de_flags & MDDB_F_OPT) &&
        !(dep->de_flags & MDDB_F_CHANGELOG))
        dbp->db_recsum ^= rbp->rb_checksum;
    db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
    create_db32rec(db32p, dbp);
    crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
    err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
    if (prevdbp) {
        /*
         * A directory block was added: rewrite its predecessor too,
         * since that block now links to the new one.
         */
        dbp = prevdbp;
        uniqtime32(&dbp->db_timestamp);
        dbp->db_revision = MDDB_REV_DB;
        create_db32rec(db32p, dbp);
        crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
        err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
    }
    kmem_free((caddr_t)db32p, MDDB_BSIZE);
    if (err) {
        if (writeretry(s)) {
            /*
             * The in-core record is left in place and its id is
             * recorded in s_zombie before giving up.
             */
            s->s_zombie = newid;
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NOTNOW);
        }
    }
    single_thread_end(s);
    mddb_setexit(s);

    ASSERT((newid & MDDB_SETMASK) == 0);
    return (MAKERECID(setno, newid));
}

/*
 * Delete the record identified by `id' from its set's database.
 *
 * The record's in-core data, record block and directory entry are
 * freed, its blocks are returned via blkfree(), and the owning
 * directory block is rewritten to all replicas.
 *
 * Returns 0 on success, and also when no such record exists (that
 * case is accounted against s_staledeletes, i.e. it is expected to be
 * the retry of a delete that previously failed to reach disk).
 * Returns MDDB_E_NOTNOW when checkstate() refuses or when the
 * directory write and its retry both fail.
 */
int
mddb_deleterec(
    mddb_recid_t    id
)
{
    mddb_set_t  *s;
    mddb_db_t   *dbp;
    mddb_db32_t *db32p;
    /*
     * dep must start out NULL: when the set has no directory blocks
     * the search loop below never assigns it, and the "no such record"
     * test would otherwise read an indeterminate pointer.
     */
    mddb_de_ic_t    *dep = NULL;
    mddb_de_ic_t    *dep1 = NULL;
    int     i;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
    ASSERT(s != NULL);

    id = DBID(id);
    if (checkstate(s, MDDB_PROBE)) {
        mddb_setexit(s);
        return (MDDB_E_NOTNOW);
    }

    ASSERT(s->s_lbp != NULL);
    single_thread_start(s);

    /*
     * Locate the directory entry; dep1 trails dep so that the entry
     * can be unlinked from its directory block's list afterwards.
     */
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        dep1 = NULL;
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            if (dep->de_recid == id)
                break;
            dep1 = dep;
        }
        if (dep != NULL)
            break;
    }
    /*
     * no such record
     */
    if (dep == NULL) {
        single_thread_end(s);
        ASSERT(s->s_staledeletes != 0);
        s->s_staledeletes--;
        mddb_setexit(s);
        return (0);
    }

    /*
     * Back the record's checksum out of the directory block's XOR sum;
     * optimized and change log records are not part of that sum.
     */
    if (!(dep->de_flags & MDDB_F_OPT) &&
        !(dep->de_flags & MDDB_F_CHANGELOG)) {
        dbp->db_recsum ^= dep->de_rb->rb_checksum;
        dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
    }

    /*
     * Free the separate user-data copy.  A non-zero de_icreqsize means
     * the buffer was replaced by a larger in-core one, which has to be
     * freed through de_rb_userdata_ic with that size.
     */
    if (dep->de_rb_userdata != NULL) {
        if (dep->de_icreqsize)
            kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
        else
            kmem_free(dep->de_rb_userdata, dep->de_reqsize);
    }

    kmem_free((caddr_t)dep->de_rb, dep->de_recsize);

    for (i = 0; i < dep->de_blkcount; i++)
        blkfree(s, dep->de_blks[i]);

    /* unlink the entry from its directory block */
    if (dep1)
        dep1->de_next = dep->de_next;
    else
        dbp->db_firstentry = dep->de_next;

    kmem_free(dep, sizeofde(dep));

    /* rewrite the directory block, now without the entry */
    uniqtime32(&dbp->db_timestamp);
    dbp->db_revision = MDDB_REV_DB;
    db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
    create_db32rec(db32p, dbp);
    crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
    if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
        if (writeretry(s)) {
            /*
             * staledelete is used to mark deletes which failed.
             * its only use is to not panic when the user retries
             * the delete once the database is active again
             */
            single_thread_end(s);
            s->s_staledeletes++;
            kmem_free((caddr_t)db32p, MDDB_BSIZE);
            mddb_setexit(s);
            return (MDDB_E_NOTNOW);
        }
    }
    single_thread_end(s);
    kmem_free((caddr_t)db32p, MDDB_BSIZE);
    mddb_setexit(s);
    return (0);
}

/*
 * Iterate over the records of a set.
 *
 * `id' carries the set number plus the id of the record to continue
 * after; an id part of zero starts from the first record.  Only
 * records whose type1 matches `typ' (or any, for MDDB_ALL) and whose
 * type2 matches `type2' (or any, for 0) are returned.
 *
 * Returns the full id of the next matching record, 0 when there is
 * none, MDDB_E_NORECORD when the starting record itself was not
 * found, or the error stored by mddb_setenter().
 */
mddb_recid_t
mddb_getnextrec(
    mddb_recid_t        id,
    mddb_type_t     typ,
    uint_t          type2
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    int         looking, err;
    set_t           setno;

    setno = DBSET(id);
    id = DBID(id);
    /* non-zero while the starting record has not been passed yet */
    looking = id;

    s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
    if (s == NULL)
        return (err);

    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        for (entp = dirp->db_firstentry;
            entp != NULL; entp = entp->de_next) {
            if (looking) {
                if (entp->de_recid == id)
                    looking = 0;
                continue;
            }

            /* past the starting point: apply the type filters */
            if (typ != MDDB_ALL && entp->de_type1 != typ)
                continue;
            if (type2 != 0 && entp->de_type2 != type2)
                continue;

            id = entp->de_recid;
            mddb_setexit(s);
            ASSERT((id & MDDB_SETMASK) == 0);
            return (MAKERECID(setno, id));
        }
    }

    mddb_setexit(s);

    if (looking)
        return (MDDB_E_NORECORD);
    return (0);
}

/*
 * Return the address of a record's data: the separate user-data
 * buffer when the directory entry has one, otherwise the rb_data area
 * inside the record block.  Returns NULL when the set cannot be
 * entered or the record does not exist.
 */
void *
mddb_getrecaddr(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    void            *addr = NULL;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
    if (s == NULL)
        return (NULL);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        /* scan this directory block for the wanted record id */
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp == NULL)
            continue;

        if (entp->de_rb_userdata)
            addr = (void *)entp->de_rb_userdata;
        else
            addr = (void *)entp->de_rb->rb_data;
        break;
    }

    mddb_setexit(s);
    return (addr);
}


/*
 * Return the in-core directory entry of record `id', or NULL when the
 * set cannot be entered or no entry carries that record id.  The set
 * is exited again before returning.
 */
mddb_de_ic_t *
mddb_getrecdep(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp = NULL;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
    if (s == NULL)
        return (NULL);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp != NULL)
            break;
    }

    /* entp is NULL here unless the search stopped on a match */
    mddb_setexit(s);
    return (entp);
}

/*
 * Like mddb_getrecaddr(), but make sure the in-core user data of the
 * record is at least `icsize' bytes.
 *
 * For a record with a separate user-data buffer that is smaller than
 * `icsize' and has not been grown before, a zeroed buffer of `icsize'
 * bytes is allocated, the old data is copied to offset `off' within
 * it, and the start of the new buffer is returned.  Otherwise the
 * existing data address is returned unchanged.  A record without a
 * separate buffer cannot be grown; asking for more than its record
 * block holds panics.
 *
 * Returns NULL when the set cannot be entered or the record is
 * missing.
 */
void *
mddb_getrecaddr_resize(
    mddb_recid_t        id,
    size_t          icsize,
    off_t           off
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    void            *addr = NULL;
    caddr_t         grown;
    size_t          needed;
    /* LINTED variable unused - used for sizeof calculations */
    mddb_rb32_t     *rbsz;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
    if (s == NULL)
        return (NULL);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp == NULL)
            continue;

        if (entp->de_rb_userdata)
            addr = (void *)entp->de_rb_userdata;
        else
            addr = (void *)entp->de_rb->rb_data;
        if (addr != NULL)
            break;
    }

    if (addr == NULL) {
        mddb_setexit(s);
        return (NULL);
    }

    if (!entp->de_rb_userdata) {
        /* data lives in the record block itself: only verify size */
        needed = roundup((sizeof (*rbsz) - sizeof (rbsz->rb_data)) +
            icsize, MDDB_BSIZE);
        if (entp->de_recsize < needed)
            cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
                "nonoptimized records can be resized\n");
        mddb_setexit(s);
        return (addr);
    }

    /* already grown once, or already big enough: nothing to do */
    if (entp->de_icreqsize || (entp->de_reqsize >= icsize)) {
        mddb_setexit(s);
        return (addr);
    }

    ASSERT((entp->de_reqsize + off) <= icsize);
    grown = kmem_zalloc(icsize, KM_SLEEP);
    bcopy(entp->de_rb_userdata, grown + off, entp->de_reqsize);
    kmem_free(entp->de_rb_userdata, entp->de_reqsize);

    /*
     * de_rb_userdata keeps pointing at the original-format data,
     * de_rb_userdata_ic at the start of the enlarged buffer.
     */
    entp->de_rb_userdata = grown + off;
    entp->de_rb_userdata_ic = grown;
    entp->de_icreqsize = icsize;

    mddb_setexit(s);
    return (grown);
}

/*
 * Return the rb_private word of record `id'.  Returns
 * MDDB_E_NORECORD when the record does not exist, or the error stored
 * by mddb_setenter() when the set cannot be entered.
 */
int
mddb_getrecprivate(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    int         err = 0;
    int         result = MDDB_E_NORECORD;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
    if (s == NULL)
        return (err);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp != NULL) {
            result = (int)entp->de_rb->rb_private;
            break;
        }
    }

    mddb_setexit(s);
    return (result);
}

/*
 * Store `private' in the rb_private word of record `id'.  Failing to
 * enter the set or to find the record trips ASSERT(0); otherwise
 * nothing is reported to the caller.
 */
void
mddb_setrecprivate(
    mddb_recid_t        id,
    uint_t          private
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
    if (s == NULL) {
        ASSERT(0);
        return;
    }

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp == NULL)
            continue;

        entp->de_rb->rb_private = private;
        mddb_setexit(s);
        return;
    }

    /* record not found */
    mddb_setexit(s);
    ASSERT(0);
}

/*
 * Return the type1 field of record `id'.  Returns MDDB_E_NORECORD
 * when the record does not exist, or the error stored by
 * mddb_setenter() when the set cannot be entered.
 */
mddb_type_t
mddb_getrectype1(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    int         err = 0;
    mddb_type_t     type1;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
    if (s == NULL)
        return (err);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp == NULL)
            continue;

        type1 = entp->de_type1;
        mddb_setexit(s);
        return (type1);
    }

    mddb_setexit(s);
    return (MDDB_E_NORECORD);
}

/*
 * Return the type2 field of record `id' as an int.  Returns
 * MDDB_E_NORECORD when the record does not exist, or the error stored
 * by mddb_setenter() when the set cannot be entered.
 */
int
mddb_getrectype2(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    int         err = 0;
    int         type2 = MDDB_E_NORECORD;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
    if (s == NULL)
        return (err);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp != NULL) {
            type2 = (int)entp->de_type2;
            break;
        }
    }

    mddb_setexit(s);
    return (type2);
}

/*
 * Return the requested (user) size of record `id' as an int.
 * Returns MDDB_E_NORECORD when the record does not exist, or the
 * error stored by mddb_setenter() when the set cannot be entered.
 */
int
mddb_getrecsize(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dirp;
    mddb_de_ic_t        *entp;
    int         err = 0;
    int         size;

    s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
    if (s == NULL)
        return (err);

    id = DBID(id);
    for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
        entp = dirp->db_firstentry;
        while (entp != NULL && entp->de_recid != id)
            entp = entp->de_next;
        if (entp == NULL)
            continue;

        size = (int)entp->de_reqsize;
        mddb_setexit(s);
        return (size);
    }

    mddb_setexit(s);
    return (MDDB_E_NORECORD);
}


/*
 * Report the status of record `id':
 *   MDDB_NORECORD  no directory entry carries that record id
 *   MDDB_NODATA    the record exists but its commit count is zero
 *   MDDB_STALE     the record has been committed but the set is
 *                  flagged MD_SET_STALE
 *   MDDB_OK        otherwise
 * When the set cannot be entered, the error stored by mddb_setenter()
 * is returned, cast to mddb_recstatus_t.
 */
mddb_recstatus_t
mddb_getrecstatus(
    mddb_recid_t        id
)
{
    mddb_set_t      *s;
    mddb_db_t       *dbp;
    /*
     * Must start out NULL: with an empty directory list the search
     * loop never assigns dep, and the status decision below would
     * otherwise test an indeterminate pointer.
     */
    mddb_de_ic_t        *dep = NULL;
    int         err = 0;
    mddb_recstatus_t    e_err;

    if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
        return ((mddb_recstatus_t)err);

    id = DBID(id);
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry;
            dep != NULL; dep = dep->de_next) {
            if (dep->de_recid == id)
                break;
        }
        if (dep)
            break;
    }

    e_err = MDDB_OK;

    if (! dep)
        e_err = MDDB_NORECORD;
    else if (! dep->de_rb->rb_commitcnt)
        e_err = MDDB_NODATA;
    else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
        e_err = MDDB_STALE;

    mddb_setexit(s);
    return (e_err);
}

static int  mddb_commitrec_retries = 5;

/*
 * Commit given record to disk.
 * If committing an optimized record, do not call
 * with md ioctl lock held.
 *
 * Records without MDDB_F_OPT are handed to mddb_commitrecs().  For an
 * optimized record the commit count, timestamp and crc are refreshed and
 * the record is written with writeoptrecord().  A write failure is handled
 * by writeretry() for traditional/local sets; for multi-node sets the
 * failure is reported to the master (MD_MN_MSG_MDDB_OPTRECERR) and the
 * write is retried, giving up after mddb_commitrec_retries failed rounds.
 *
 * Returns:
 *  0                   success, or DBID(id) is 0 (nothing to commit)
 *  MDDB_E_NOTNOW       checkstate() failed, writeretry() failed, or the
 *                      multi-node set has MD_SET_TOOFEW set
 *  MDDB_E_NORECORD     no directory entry matches the record id
 *  MDDB_E_STALE        the multi-node retries were exhausted
 *  otherwise whatever mddb_commitrecs() returns for a non-optimized record
 */
int
mddb_commitrec(
    mddb_recid_t    id
)
{
    mddb_set_t          *s;
    mddb_db_t           *dbp;
    mddb_de_ic_t            *dep = NULL;
    mddb_recid_t            ids[2];
    mddb_rb32_t         *rbp;
    /*
     * Deliberately an automatic variable: the value is read after
     * mddb_setexit(), so a function-static copy could be overwritten by
     * a commit running on another set.
     */
    int             err = 0;
    md_mn_msg_mddb_optrecerr_t  *msg_recerr;
    md_mn_kresult_t         *kres;
    mddb_lb_t           *lbp;
    mddb_mnlb_t         *mnlbp;
    mddb_locator_t          *lp;
    mddb_mnsidelocator_t        *mnslp;
    mddb_drvnm_t            *dn;
    int             li;
    md_replica_recerr_t     *recerr;
    int             i, j;
    int             rval;
    int             hit_err = 0;
    int             retry = mddb_commitrec_retries;
    int             gave_up = 0;

    s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
    ASSERT(s != NULL);

    if (checkstate(s, MDDB_PROBE)) {
        mddb_setexit(s);
        return (MDDB_E_NOTNOW);
    }

    if (DBID(id) == 0) {
        mddb_setexit(s);
        return (0);
    }

    /* dep was initialized to NULL in case the set has no directory blocks */
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            if (dep->de_recid == DBID(id))
                break;
        }
        if (dep)
            break;
    }

    if (dep == NULL) {
        mddb_setexit(s);
        return (MDDB_E_NORECORD);
    }

    if (! (dep->de_flags & MDDB_F_OPT)) {
        ids[0] = id;
        ids[1] = 0;
        mddb_setexit(s);
        return (mddb_commitrecs(ids));
    }

    /*
     * following code allows multiple processes to be doing
     * optimization commits in parallel.
     * NOTE: if lots of optimization commits then the lock
     * will not get released until it winds down
     */
    if (s->s_optwaiterr) {
        while (s->s_optwaiterr) {
            s->s_opthungerr = 1;
            cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
        }
        if (checkstate(s, MDDB_PROBE)) {
            mddb_setexit(s);
            return (MDDB_E_NOTNOW);
        }
    }
    if (s->s_optcmtcnt++ == 0) {
        single_thread_start(s);
        s->s_opthavelck = 1;
        if (s->s_optwantlck) {
            cv_broadcast(&s->s_optwantlck_cv);
            s->s_optwantlck = 0;
        }
    } else {
        while (! s->s_opthavelck) {
            s->s_optwantlck = 1;
            cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
        }
    }

    /*
     * Look the entry up again: the cv_wait() calls above drop the set
     * mutex.  Reset dep first so a stale pointer is never reused if the
     * loop does not run.
     */
    dep = NULL;
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
            if (dep->de_recid == DBID(id))
                break;
        }
        if (dep)
            break;
    }

    if (dep == NULL) {
        if (! (--s->s_optcmtcnt)) {
            single_thread_end(s);
            s->s_opthavelck = 0;
        }
        mddb_setexit(s);
        return (MDDB_E_NORECORD);
    }

    rbp = dep->de_rb;
    rbp->rb_commitcnt++;
    uniqtime32(&rbp->rb_timestamp);
    /* Generate the crc for this record */
    rec_crcgen(s, dep, rbp);

    if (writeoptrecord(s, dep)) {
        if (MD_MNSET_SETNO(s->s_setno)) {
            hit_err = 1;
        }
        s->s_optwaiterr++;
    }
    if (MD_MNSET_SETNO(s->s_setno)) {
        /* If last thread out, release single_thread_start */
        if (! (--s->s_optcmtcnt)) {
            single_thread_end(s);
            s->s_opthavelck = 0;
        }
        /*
         * If this thread had a writeoptrecords failure, then
         * need to send message to master.
         * But, multiple threads could all be running on the
         * same single_thread_start, so serialize the threads
         * by making each thread grab single_thread_start.
         *
         * After return from sending message to master message,
         * replicas associated with optimized record will have
         * been changed (via a callback from the master to all
         * nodes), so retry call to writeoptrecord.
         * This code is replacing the call to writeretry that
         * occurs for the local and traditional disksets.
         */
        if (hit_err) {
            single_thread_start(s);
            /*
             * If > 50% of replicas are alive then continue
             * to send message to master until writeoptrecord
             * succeeds.  For now, assume that minor name,
             * major number on this node is the same as on
             * the master node.  Once devids are turned on
             * for MN disksets, can send devid.
             */
            kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
            msg_recerr = kmem_zalloc(
                sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
            /*
             * Give recerr a valid (zeroed) target so the "giving
             * up" warning below never reads an unset pointer when
             * no side locator matched this node.
             */
            recerr = &msg_recerr->msg_recerr[0];
            while (!(md_get_setstatus(s->s_setno) &
                MD_SET_TOOFEW)) {
                bzero((caddr_t)msg_recerr,
                    sizeof (md_mn_msg_mddb_optrecerr_t));
                lbp = s->s_lbp;
                mnlbp = (mddb_mnlb_t *)lbp;
                for (i = 0; i < 2; i++) {
                    li = dep->de_optinfo[i].o_li;
                    lp = &lbp->lb_locators[li];
                    for (j = 0; j < MD_MNMAXSIDES; j++) {
                        mnslp =
                            &mnlbp->
                            lb_mnsidelocators[j][li];
                        if (mnslp->mnl_sideno ==
                            s->s_sideno)
                            break;
                    }
                    if (j == MD_MNMAXSIDES)
                        continue;

                    dn = &lbp->
                        lb_drvnm[mnslp->mnl_drvnm_index];
                    recerr = &msg_recerr->msg_recerr[i];
                    recerr->r_li = li;
                    recerr->r_flags =
                        dep->de_optinfo[i].o_flags;
                    recerr->r_blkno = lp->l_blkno;
                    recerr->r_mnum = md_getminor(lp->l_dev);
                    (void) strncpy(recerr->r_driver_name,
                        dn->dn_data, MD_MAXDRVNM);
                }

                /* Release locks */
                single_thread_end(s);
                mutex_exit(SETMUTEX(s->s_setno));

                /*
                 * Send message to master about optimized
                 * record failure.  After return, master
                 * should have marked failed replicas
                 * and sent parse message to slaves causing
                 * slaves to have fixed up the optimized
                 * record.
                 * On return from ksend_message, retry
                 * the write since this node should have fixed
                 * the optimized resync records it owns.
                 */
                rval = mdmn_ksend_message(s->s_setno,
                    MD_MN_MSG_MDDB_OPTRECERR,
                    MD_MSGF_NO_BCAST, 0,
                    (char *)msg_recerr,
                    sizeof (md_mn_msg_mddb_optrecerr_t),
                    kres);
                if (!MDMN_KSEND_MSG_OK(rval, kres)) {
                    cmn_err(CE_WARN, "mddb_commitrec: "
                        "Unable to send optimized "
                        "resync record failure "
                        "message to other nodes in "
                        "diskset %s\n", s->s_setname);
                    mdmn_ksend_show_error(rval, kres,
                        "MD_MN_MSG_MDDB_OPTRECERR");
                }

                /* Regrab locks */
                mutex_enter(SETMUTEX(s->s_setno));
                single_thread_start(s);

                /*
                 * Start over in case mddb changed.  Reset dep
                 * so an entry from before the locks were
                 * dropped is never reused.
                 */
                dep = NULL;
                for (dbp = s->s_dbp; dbp != NULL;
                    dbp = dbp->db_next) {
                    for (dep = dbp->db_firstentry; dep;
                        dep = dep->de_next) {
                        if (dep->de_recid == DBID(id))
                            break;
                    }
                    if (dep)
                        break;
                }
                if (dep) {
                    rbp = dep->de_rb;
                    rbp->rb_commitcnt++;
                    uniqtime32(&rbp->rb_timestamp);
                    /* Generate the crc for this record */
                    rec_crcgen(s, dep, rbp);

                    /*
                     * If writeoptrecord succeeds, then
                     * break out.
                     */
                    if (!(writeoptrecord(s, dep)))
                        break;
                }
                if (--retry == 0) {
                    cmn_err(CE_WARN, "mddb_commitrec: "
                        "giving up writing optimized "
                        "resync record for "
                        "diskset %s, device %s,%d "
                        "blkno 0x%x, flags 0x%x\n",
                        s->s_setname, recerr->r_driver_name,
                        recerr->r_mnum, recerr->r_blkno,
                        recerr->r_flags);
                    gave_up++;
                    break;
                }
            }
            kmem_free(kres, sizeof (md_mn_kresult_t));
            kmem_free(msg_recerr,
                sizeof (md_mn_msg_mddb_optrecerr_t));

            /* Resync record should be fixed - if possible */
            s->s_optwaiterr--;
            if (s->s_optwaiterr == 0) {
                /* All errors have been handled */
                if (s->s_opthungerr) {
                    s->s_opthungerr = 0;
                    cv_broadcast(&s->s_opthungerr_cv);
                }
            }
            single_thread_end(s);
            mddb_setexit(s);
            if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
                return (MDDB_E_NOTNOW);
            } else if (gave_up) {
                return (MDDB_E_STALE);
            } else {
                return (0);
            }
        }
    } else {
        /* If set is a traditional or local set */
        if (! (--s->s_optcmtcnt)) {
            /* Last committer out: retry any failed writes */
            err = 0;
            if (s->s_optwaiterr) {
                err = writeretry(s);
                s->s_optwaiterr = 0;
                if (s->s_opthungerr) {
                    s->s_opthungerr = 0;
                    cv_broadcast(&s->s_opthungerr_cv);
                }
            }
            single_thread_end(s);
            s->s_opthavelck = 0;
            mddb_setexit(s);
            if (err)
                return (MDDB_E_NOTNOW);
            return (0);
        }
        /* Not the last one out: wait for pending errors to clear */
        if (s->s_optwaiterr) {
            while (s->s_optwaiterr) {
                s->s_opthungerr = 1;
                cv_wait(&s->s_opthungerr_cv,
                    SETMUTEX(s->s_setno));
            }
            if (checkstate(s, MDDB_NOPROBE)) {
                mddb_setexit(s);
                return (MDDB_E_NOTNOW);
            }
        }
    }

    mddb_setexit(s);
    return (0);
}

int
mddb_commitrecs(
    mddb_recid_t    ids[]
)
{
    mddb_set_t  *s;
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    mddb_rb32_t *rbp;
    mddb_rb32_t *saverbp;
    mddb_lb_t   *lbp;
    int     li;
    uint_t      checksum;
    mddb_recid_t    *idp;
    int     err = 0;
    set_t       setno;

    if (panicstr)
        cmn_err(CE_PANIC, "md: mddb: commit not allowed");

    /*
     * scan through and make sure ids are from the same set
     */
    setno = DBSET(ids[0]);
    for (idp = ids; *idp != NULL; idp++)
        ASSERT(DBSET(*idp) == setno);

    s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);

    if (checkstate(s, MDDB_PROBE)) {
        mddb_setexit(s);
        return (MDDB_E_NOTNOW);
    }

    ASSERT(s->s_lbp != NULL);
    err = 0;

    if (! ids[0]) {
        mddb_setexit(s);
        return (0);
    }

    single_thread_start(s);
    /*
     * scan through and make sure ids all exist
     */
    for (idp = ids; *idp != NULL; idp++) {
        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
            for (dep = dbp->db_firstentry; dep;
                dep = dep->de_next) {
                if (dep->de_recid == DBID(*idp))
                    break;
            }
            if (dep != NULL)
                break;
        }
        if (dep == NULL) {
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NORECORD);
        }
    }

    /*
     * scan through records fix commit counts and
     * zero fiddles and update time stamp and rechecksum record
     */
    checksum = 0;
    idp = ids;
    saverbp = NULL;
    while (*idp) {
        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
            for (dep = dbp->db_firstentry; dep;
                dep = dep->de_next) {
                if (dep->de_recid == DBID(*idp))
                    break;
            }
            if (dep != NULL)
                break;
        }
        rbp = dep->de_rb;
        ASSERT(! (dep->de_flags & MDDB_F_OPT));

        getuserdata(setno, dep);
        /* Don't do fiddles for CHANGE LOG records */
        if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
            checksum ^= rbp->rb_checksum_fiddle;
            rbp->rb_checksum_fiddle = 0;
            checksum ^= rbp->rb_checksum;
            saverbp = rbp;
        }
        rbp->rb_commitcnt++;
        uniqtime32(&rbp->rb_timestamp);
        /* Generate the crc for this record */
        rec_crcgen(s, dep, rbp);

        /* Don't do fiddles for CHANGE LOG records */
        if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
            checksum ^= rbp->rb_checksum;
        }
        idp++;
    }

    if (saverbp)
        saverbp->rb_checksum_fiddle = checksum;

    /*
     * If this is a MN set but we are not the master, then we are not
     * supposed to update the mddb on disk. So we finish at this point.
     */
    if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
        (md_set[setno].s_am_i_master == 0)) {
        single_thread_end(s);
        mddb_setexit(s);
        return (0);
    }

    lbp = s->s_lbp;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
            continue;

        idp = ids;
        while (*idp) {
            for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
                dep = dbp->db_firstentry;
                while (dep && (dep->de_recid != DBID(*idp)))
                    dep = dep->de_next;
                if (dep != NULL)
                    break;
            }
            rbp = dep->de_rb;
            err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
                dep->de_blkcount, li, (mddb_bf_t **)0,
                MDDB_WR_ONLY_MASTER);
            if (err)
                break;
            idp++;
        }
        if (err)
            break;
    }
    if (err) {
        if (writeretry(s)) {
            single_thread_end(s);
            mddb_setexit(s);
            return (MDDB_E_NOTNOW);
        }
    }
    single_thread_end(s);
    mddb_setexit(s);
    return (0);
}

/*
 * Build a record id from a set number and an id by way of MAKERECID().
 */
mddb_recid_t
mddb_makerecid(
    set_t       setno,
    mddb_recid_t    id
)
{
    mddb_recid_t    recid;

    recid = MAKERECID(setno, id);
    return (recid);
}

/*
 * Extract the set number from a record id by way of DBSET().
 */
set_t
mddb_getsetnum(
    mddb_recid_t    id
)
{
    set_t   setno;

    setno = DBSET(id);
    return (setno);
}

/*
 * Return the name (s_setname) of set `setno'.
 *
 * Returns NULL when the set has no database structure attached
 * (md_set[setno].s_db is NULL) rather than dereferencing a NULL pointer;
 * mddb_getsidenum() guards the same pointer in the same way.
 */
char *
mddb_getsetname(
    set_t   setno
)
{
    mddb_set_t  *s = (mddb_set_t *)md_set[setno].s_db;

    if (s == NULL)
        return (NULL);
    return (s->s_setname);
}

/*
 * Return the side number (s_sideno) of set `setno', or 0 when the set has
 * no database structure attached (md_set[setno].s_db is NULL).
 */
side_t
mddb_getsidenum(
    set_t   setno
)
{
    mddb_set_t  *setp = (mddb_set_t *)md_set[setno].s_db;

    return ((setp != NULL) ? setp->s_sideno : 0);
}

/*
 * Return 1 if set `setno' has a database structure attached and either
 * the set status has MD_SET_TAGDATA set or a locator block (s_lbp) is
 * present; return 0 otherwise.
 */
int
mddb_ownset(
    set_t   setno
)
{
    mddb_set_t  *setp;
    int         tagged;

    tagged = ((md_get_setstatus(setno) & MD_SET_TAGDATA) != 0);
    setp = (mddb_set_t *)md_set[setno].s_db;

    /* Without a database structure neither condition can hold */
    if (setp == NULL)
        return (0);

    if (tagged || setp->s_lbp)
        return (1);

    return (0);
}

/*
 * Copy the mediator data (s_med) of the requested set into the caller's
 * parameter block.
 *
 * Failures are reported through medpp->med_mde; the function's return
 * value is whatever the md error helper returns, or 0.  A failed snarf of
 * the local set returns 0 with the error left in med_mde.
 */
/*ARGSUSED*/
int
getmed_ioctl(mddb_med_parm_t *medpp, int mode)
{
    md_error_t  *ep = &medpp->med_mde;
    set_t       setno = medpp->med_setno;
    mddb_set_t  *setp;
    int         status = 0;

    mdclrerror(ep);

    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    /* The set must already be snarfed */
    if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
        return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

    setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
    if (setp == NULL)
        return (mddbstatus2error(ep, status, NODEV32, setno));

    /* structure assignment */
    medpp->med = setp->s_med;

    mddb_setexit(setp);

    return (0);
}

/*
 * Store the caller-supplied mediator data into the set's s_med.
 *
 * Requires FWRITE in `mode' (EACCES otherwise).  The local set and
 * out-of-range set numbers are rejected with MDE_INVAL_UNIT.  Failures
 * are reported through medpp->med_mde.
 */
int
setmed_ioctl(mddb_med_parm_t *medpp, int mode)
{
    md_error_t  *ep = &medpp->med_mde;
    set_t       setno = medpp->med_setno;
    mddb_set_t  *setp;
    int         status = 0;

    mdclrerror(ep);

    if (!(mode & FWRITE))
        return (mdsyserror(ep, EACCES));

    /*
     * Rejecting MD_LOCAL_SET here should be the only thing that keeps
     * LOCAL sets from having mediators, at least in the kernel; userland
     * would need code written as well.  Set numbers beyond md_nsets are
     * refused with the same error.
     */
    if (setno == MD_LOCAL_SET || setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
        return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

    setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
    if (setp == NULL)
        return (mddbstatus2error(ep, status, NODEV32, setno));

    /* structure assignment */
    setp->s_med = medpp->med;

    mddb_setexit(setp);

    return (0);
}

/*
 * Run upd_med() on the requested set while holding the set
 * single-threaded.
 *
 * Requires FWRITE in `mode' (EACCES otherwise).  The result of upd_med()
 * is ignored; failures of the preliminary checks are reported through
 * medpp->med_mde.
 */
int
updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
{
    md_error_t  *ep = &medpp->med_mde;
    set_t       setno = medpp->med_setno;
    mddb_set_t  *setp;
    int         status = 0;

    mdclrerror(ep);

    if (!(mode & FWRITE))
        return (mdsyserror(ep, EACCES));

    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
        return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

    setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
    if (setp == NULL)
        return (mddbstatus2error(ep, status, NODEV32, setno));

    single_thread_start(setp);
    (void) upd_med(setp, "updmed_ioctl()");
    single_thread_end(setp);

    mddb_setexit(setp);

    return (0);
}

/*
 * Take ownership of the set described by `cp'.
 *
 * Nothing is done (0 is returned) when the set is already snarfed.
 * Otherwise the set is configured with MDDB_GETDEV, snarfed, and - when
 * cp->c_flags is zero - its mediator data is updated via updmed_ioctl().
 * Returns the status of mddb_configure() or, if reached, updmed_ioctl();
 * md errors are left in cp->c_mde.
 */
int
take_set(mddb_config_t *cp, int mode)
{
    md_error_t          *ep = &cp->c_mde;
    set_t               setno = cp->c_setno;
    mddb_med_upd_parm_t medup;
    mddb_set_t          *setp;
    int                 snarfed = 0;
    int                 rval;

    if (md_get_setstatus(setno) & MD_SET_SNARFED)
        return (0);

    rval = mddb_configure(MDDB_GETDEV, cp);
    if (rval == 0 && mdisok(ep)) {
        if (md_snarf_db_set(setno, ep) != 0)
            goto out;
        snarfed = 1;
    }

    /*
     * The replicated import flag is used during the take of a diskset
     * with previously unresolved replicated disks; clear it now.
     */
    if (md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)
        md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);

    if (rval == 0 && mdisok(ep) && !cp->c_flags) {
        medup.med_setno = setno;
        mdclrerror(&medup.med_mde);

        rval = updmed_ioctl(&medup, mode);
        if (!mdisok(&medup.med_mde))
            (void) mdstealerror(ep, &medup.med_mde);
    }

out:
    /*
     * When the snarf failed the diskset is left with s_db set but s_lbp
     * not set.  The node is not an owner of the set and will not be
     * allowed to release it in order to clean up.  With s_db set, any
     * GETDEV or ENDDEV ioctl (issued by the libmeta routine
     * metareplicalist) would cause the diskset to be loaded.  Unload the
     * set here so that it is not started inadvertently later on.
     */
    setp = (mddb_set_t *)md_set[setno].s_db;
    if (!snarfed && setp != NULL && setp->s_lbp == 0) {
        mutex_enter(&mddb_lock);
        mddb_unload_set(setno);
        mutex_exit(&mddb_lock);
    }
    return (rval);
}

/*
 * Release the set described by `cp': halt it with MD_HALT_ALL and, if
 * that succeeds, configure it with MDDB_RELEASESET.
 *
 * Returns EBUSY when the set is currently marked HOLD, the md error
 * helper's value for an out-of-range set number, or otherwise the status
 * of md_halt_set()/mddb_configure().  On success with no md error an
 * ESC_SVM_RELEASE event is posted.
 */
/*ARGSUSED*/
int
release_set(mddb_config_t *cp, int mode)
{
    md_error_t  *ep = &cp->c_mde;
    set_t       setno = cp->c_setno;
    int         rval;

    /* Data integrity check */
    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    md_haltsnarf_enter(setno);

    /*
     * Try to mark the set as HOLD.  If it is already marked HOLD the
     * mirror code is searching all mirrors for an errored component
     * that needs a hotspare; while that search is in progress the set
     * cannot be released, therefore EBUSY is returned.  Once HOLD has
     * been set here, the mirror function (check_4_hotspares) blocks
     * before its search until the set is released.
     */
    if (md_holdset_testandenter(setno) != 0) {
        md_haltsnarf_exit(setno);
        rw_exit(&md_unit_array_rw.lock);
        return (EBUSY);
    }

    rval = md_halt_set(setno, MD_HALT_ALL);
    if (rval == 0)
        rval = mddb_configure(MDDB_RELEASESET, cp);

    /* Drop everything in the reverse order of acquisition */
    md_holdset_exit(setno);
    md_haltsnarf_exit(setno);
    rw_exit(&md_unit_array_rw.lock);

    if (rval == 0 && mdisok(ep)) {
        SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
            NODEV64);
    }

    return (rval);
}

/*
 * Look up a data tag of the requested set and copy it into
 * dtgpp->dtgp_dt.
 *
 * A requested dt_id of 0 matches the first tag on the set's list;
 * otherwise the tag with the same dt_id is returned.  Requires FREAD in
 * `mode' (EACCES otherwise).  Multi-node sets are refused with
 * MDE_INVAL_MNOP and a missing tag yields MDE_DB_NOTAG; errors are
 * reported through dtgpp->dtgp_mde.
 */
int
gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
{
    md_error_t      *ep = &dtgpp->dtgp_mde;
    set_t           setno = dtgpp->dtgp_setno;
    mddb_set_t      *setp;
    mddb_dtag_lst_t *node;
    int             status = 0;

    mdclrerror(ep);

    if (!(mode & FREAD))
        return (mdsyserror(ep, EACCES));

    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    setp = mddb_setenter(setno, MDDB_NOINIT, &status);
    if (setp == NULL)
        return (mddbstatus2error(ep, status, NODEV32, setno));

    /*
     * Data tags are not supported on MN sets, so this is an invalid
     * operation for them.  The ioctl may arrive before the mddb has
     * been read in, in which case the set status is not yet MNSET; the
     * code below therefore has to cope with a MN diskset as well.
     */
    if (md_get_setstatus(setno) & MD_SET_MNSET) {
        mddb_setexit(setp);
        return (mderror(ep, MDE_INVAL_MNOP));
    }

    /* s_dtlp is NULL for MN diskset, so the loop is then skipped */
    for (node = setp->s_dtlp; node != NULL; node = node->dtl_nx) {
        if (dtgpp->dtgp_dt.dt_id == 0 ||
            dtgpp->dtgp_dt.dt_id == node->dtl_dt.dt_id) {
            bcopy((caddr_t)&node->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
                sizeof (mddb_dtag_t));
            break;
        }
    }

    mddb_setexit(setp);

    /* Ran off the end of the list without a match */
    if (node == (mddb_dtag_lst_t *)NULL)
        return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

    return (0);
}

/*
 * Select data tag dtupp->dtup_id for the set, then release the set and
 * take it again so that the selected tag is used.
 *
 * Requires FWRITE in `mode' (EACCES otherwise).  A negative tag id gives
 * EINVAL and a zero or unknown id gives MDE_DB_NOTAG.  The set must have
 * MD_SET_TAGDATA set and carry at least two tags (MDE_DB_NTAGDATA
 * otherwise).  Multi-node sets are refused with MDE_INVAL_MNOP.  Errors
 * are reported through dtupp->dtup_mde.
 *
 * The replica hint information (rip) is saved before the release and
 * re-attached to the set before the take.  MD_SET_KEEPTAG and
 * MD_SET_USETAG are set for the duration and cleared on every exit path
 * past the release.
 */
int
usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
{
    mddb_set_t  *s;
    int     err = 0;
    mddb_config_t   *cp;
    mddb_ri_t   *trip = NULL;
    mddb_dtag_t *dtagp = NULL;
    set_t       setno = dtupp->dtup_setno;
    md_error_t  *ep = &dtupp->dtup_mde;

    mdclrerror(ep);

    if ((mode & FWRITE) == 0)
        return (mdsyserror(ep, EACCES));

    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (dtupp->dtup_id < 0)
        return (mdsyserror(ep, EINVAL));
    else if (dtupp->dtup_id == 0)
        return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
        return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));

    if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
        return (mddbstatus2error(ep, err, NODEV32, setno));

    /*
     * Data tags not supported on MN sets so return invalid operation.
     * This ioctl could be called before the mddb has been read in so
     * the set status may not yet be set to MNSET, so code following
     * this check must handle a MN diskset properly.
     */
    if (md_get_setstatus(setno) & MD_SET_MNSET) {
        mddb_setexit(s);
        return (mderror(ep, MDE_INVAL_MNOP));
    }

    /* Validate and find the id requested - nothing found if MN diskset */
    if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
    }

    /* Usetag is only valid when more than one tag exists */
    if (dtl_cntl(s) < 2) {
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
    }

    /* Put the selected tag in place */
    dt_setup(s, dtagp);

    cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

    /* Save the hint information */
    trip = save_rip(s);

    cp->c_timestamp = s->s_ident.createtime;    /* struct assignment */
    cp->c_setno = setno;
    cp->c_sideno = s->s_sideno;
    (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
    cp->c_setname[MD_MAX_SETNAME] = '\0';
    cp->c_med = s->s_med;               /* struct assignment */

    mddb_setexit(s);

    s = NULL;

    /* shorthand */
    setno = cp->c_setno;

    /* Let unload know not to free the tag */
    md_set_setstatus(setno, MD_SET_KEEPTAG);

    /* Release the set */
    if ((err = release_set(cp, mode)) != 0)
        goto out;

    if (! mdisok(&cp->c_mde)) {
        (void) mdstealerror(ep, &cp->c_mde);
        err = 1;
        goto out;
    }

    /* Re-init set using the saved mddb_config_t structure */
    if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
        if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
            err = mddbstatus2error(ep, err, NODEV32, setno);
            goto out;
        }
    }

    ASSERT(s->s_rip == (mddb_ri_t *)NULL);

    /*
     * The ASSERT above vanishes in non-DEBUG kernels; free any rip the
     * set already carries (as accept_ioctl() does) so that it is not
     * leaked when the saved one is installed.
     */
    if (s->s_rip != (mddb_ri_t *)NULL)
        free_rip(&s->s_rip);

    /* use the saved rip structure */
    s->s_rip = trip;
    trip = (mddb_ri_t *)NULL;

    /* Let the take code know a tag is being used */
    md_set_setstatus(setno, MD_SET_USETAG);

    mddb_setexit(s);

    s = NULL;

    /* Take the set */
    if ((err = take_set(cp, mode)) != 0)
        goto out;

    if (! mdisok(&cp->c_mde))
        (void) mdstealerror(ep, &cp->c_mde);

out:
    md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));

    kmem_free(cp, sizeof (mddb_config_t));

    /* Only still set if the saved rip was never handed to the set */
    if (trip)
        free_rip(&trip);

    if (s)
        mddb_setexit(s);

    return (err);
}

/*
 * Accept the current data of the set: tag it, write the tag out, then
 * release the set and take it again.
 *
 * Requires FWRITE in `mode' (EACCES otherwise) and MD_SET_ACCOK in the
 * set status (MDE_DB_ACCNOTOK otherwise).  Multi-node sets are refused
 * with MDE_INVAL_MNOP.  Errors are reported through accpp->accp_mde.
 * MD_SET_ACCOK and MD_SET_ACCEPT are cleared on every exit path past the
 * config allocation.
 */
int
accept_ioctl(mddb_accept_parm_t *accpp, int mode)
{
    md_error_t      *ep = &accpp->accp_mde;
    set_t           setno = accpp->accp_setno;
    mddb_set_t      *setp;
    mddb_config_t   *cfg;
    mddb_ri_t       *saved_rip = NULL;
    int             rval = 0;

    mdclrerror(ep);

    if (!(mode & FWRITE))
        return (mdsyserror(ep, EACCES));

    if (setno >= md_nsets)
        return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if (!(md_get_setstatus(setno) & MD_SET_ACCOK))
        return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));

    setp = mddb_setenter(setno, MDDB_MUSTEXIST, &rval);
    if (setp == NULL)
        return (mddbstatus2error(ep, rval, NODEV32, setno));

    /*
     * Data tags are not supported on MN sets, so this is an invalid
     * operation for them.  The mddb is guaranteed to be incore at this
     * point, hence this check catches all MN disksets.
     */
    if (md_get_setstatus(setno) & MD_SET_MNSET) {
        mddb_setexit(setp);
        return (mderror(ep, MDE_INVAL_MNOP));
    }

    cfg = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

    /* Keep the hint information across the release/take cycle */
    saved_rip = save_rip(setp);

    /* Snapshot what is needed to re-create the set */
    cfg->c_timestamp = setp->s_ident.createtime;    /* struct assignment */
    cfg->c_setno = setno;
    cfg->c_sideno = setp->s_sideno;
    (void) strncpy(cfg->c_setname, setp->s_setname, MD_MAX_SETNAME);
    cfg->c_setname[MD_MAX_SETNAME] = '\0';
    cfg->c_med = setp->s_med;                       /* struct assignment */

    /* Tag the data */
    rval = set_dtag(setp, ep);
    if (rval != 0) {
        rval = mdsyserror(ep, rval);
        goto out;
    }

    /* A BADTAG is about to be re-written, so drop the bit */
    if (md_get_setstatus(setno) & MD_SET_BADTAG)
        md_clr_setstatus(setno, MD_SET_BADTAG);

    rval = dt_write(setp);
    if (rval != 0) {
        rval = mdsyserror(ep, rval);
        goto out;
    }

    mddb_setexit(setp);
    setp = NULL;

    /* shorthand */
    setno = cfg->c_setno;

    /* Clear the keeptag */
    md_clr_setstatus(setno, MD_SET_KEEPTAG);

    /* Release the set */
    rval = release_set(cfg, mode);
    if (rval != 0)
        goto out;

    if (!mdisok(&cfg->c_mde)) {
        (void) mdstealerror(ep, &cfg->c_mde);
        goto out;
    }

    /* Re-init set using the saved mddb_config_t structure */
    setp = mddb_setenter(setno, MDDB_NOINIT, &rval);
    if (setp == NULL) {
        setp = init_set(cfg, MDDB_NOINIT, &rval);
        if (setp == NULL) {
            rval = mddbstatus2error(ep, rval, NODEV32, setno);
            goto out;
        }
    }

    ASSERT(setp->s_rip == (mddb_ri_t *)NULL);

    /* Free the allocated rip structure */
    if (setp->s_rip != (mddb_ri_t *)NULL)
        free_rip(&setp->s_rip);

    /* Hand the saved rip structure over to the set */
    setp->s_rip = saved_rip;
    saved_rip = (mddb_ri_t *)NULL;

    /* Let the set init code know an accept is in progress */
    md_set_setstatus(setno, MD_SET_ACCEPT);

    mddb_setexit(setp);
    setp = NULL;

    /* Take the set */
    rval = take_set(cfg, mode);
    if (rval != 0)
        goto out;

    if (!mdisok(&cfg->c_mde))
        (void) mdstealerror(ep, &cfg->c_mde);

out:
    md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));

    kmem_free(cfg, sizeof (mddb_config_t));

    /* Still ours only if it was never attached to the set */
    if (saved_rip)
        free_rip(&saved_rip);

    if (setp)
        mddb_setexit(setp);

    return (rval);
}

/*
 * mddb_getinvlb_devid - cycles through the locator block and determines
 *      if the device id's for any of the replica disks are invalid.
 *      If so, it returns the diskname in the ctdptr.
 *
 *      setno   set whose locator block is examined
 *      count   maximum number of invalid device ids the caller expects
 *      size    spacing, in bytes, between consecutive names in *ctdptr
 *      ctdptr  in: start of the name list; out: position following the
 *              last name added, where a '\0' has been stored
 *
 *      Each name is the locator's name suffix with everything from its
 *      last 's' (the slice part) removed; a name already added by this
 *      call is not added again.
 *  RETURN
 *      -1  Error (set cannot be entered, set number mismatch, more than
 *          `count' invalid device ids, or a name longer than `size')
 *      cnt number of invalid device id's
 */
int
mddb_getinvlb_devid(
    set_t   setno,
    int count,
    int size,
    char    **ctdptr
)
{
    mddb_set_t  *s;
    int     err = 0;
    mddb_lb_t   *lbp;
    int     li;
    mddb_did_blk_t  *did_blk;
    mddb_did_info_t *did_info;
    int     len;
    int     cnt = 0;
    char        *cptr;
    md_name_suffix  *sn;
    int     dont_add_it;
    char        *tmpctd, *diskname;
    char        *tmpname;

    cptr = *ctdptr;
    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
        return (-1);
    }

    single_thread_start(s);
    lbp = s->s_lbp;

    if (lbp->lb_setno != setno) {
        single_thread_end(s);
        mddb_setexit(s);
        return (-1);
    }

    /* check for lb being devid style */
    if (lbp->lb_flags & MDDB_DEVID_STYLE) {
        did_blk = s->s_did_icp->did_ic_blkp;
        for (li = 0; li < lbp->lb_loccnt; li++) {
            did_info = &(did_blk->blk_info[li]);
            /* Only if devid exists and isn't valid */
            if ((did_info->info_flags & MDDB_DID_EXISTS) &&
                !(did_info->info_flags & MDDB_DID_VALID)) {
                /*
                 * if we count more invalid did's than
                 * was passed in there's an error somewhere.
                 * Pre-increment so that the (count + 1)th
                 * invalid id is already rejected.
                 */
                if (++cnt > count) {
                    single_thread_end(s);
                    mddb_setexit(s);
                    return (-1);
                }

                /*
                 * Future note: Need to do something here
                 * for the MN diskset case when device ids
                 * are supported in disksets.
                 * Can't add until merging devids_in_diskset
                 * code into code base.
                 */

                sn = &s->s_lnp->ln_suffixes[0][li];
                /*
                 * check to make sure length of device name is
                 * not greater than computed first time through
                 */
                len = sn->suf_len;
                if (len > size) {
                    single_thread_end(s);
                    mddb_setexit(s);
                    return (-1);
                }
                /*
                 * strip off slice part; a name without an 's'
                 * is kept as it is instead of writing through
                 * a NULL pointer.
                 */
                diskname = md_strdup(sn->suf_data);
                tmpname = strrchr(diskname, 's');
                if (tmpname != NULL)
                    *tmpname = '\0';
                dont_add_it = 0;
                /*
                 * look to see if diskname is already in list;
                 * only the slots filled so far (up to cptr)
                 * are compared.
                 */
                for (tmpctd = *ctdptr; tmpctd < cptr;
                    tmpctd += size) {
                    if (strcmp(diskname, tmpctd) == 0) {
                        /* already there, don't add */
                        dont_add_it = 1;
                        break;
                    }
                }
                if (dont_add_it == 0) {
                    /* add diskname to list */
                    (void) strcpy(cptr, diskname);
                    cptr += size;
                }
                /*
                 * diskname may have been shortened above, so
                 * size the free from the original string.
                 */
                kmem_free(diskname, strlen(sn->suf_data) + 1);
            }
        }
    }
    /* null terminate the list */
    *cptr = '\0';
    /*
     * need to save the new pointer so that calling routine can continue
     * to add information onto the end.
     */
    *ctdptr = cptr;
    single_thread_end(s);
    mddb_setexit(s);
    return (cnt);
}

/*
 * mddb_validate_lb - re-examine every locator whose device id is recorded
 *      but not yet marked valid.  A locator whose recorded device id matches
 *      the one currently reported for its device is promoted to
 *      valid/exists/updated; every other such locator is counted and the
 *      length of its device name is folded into *rmaxsz (longest seen).
 *      The locator block is pushed out and the mediators updated before
 *      returning, whether or not the locator block is in devid style.
 *  RETURN
 *      -1  error (set could not be entered, the locator block belongs
 *          to a different set, or push_lb failed)
 *       cnt    number of lb's with invalid devid's
 */
int
mddb_validate_lb(
    set_t   setno,
    int *rmaxsz
)
{
    mddb_set_t  *setp;
    mddb_lb_t   *lb;
    mddb_did_blk_t  *dblk;
    int     enter_err = 0;
    int     idx;
    int     invalid = 0;

    setp = mddb_setenter(setno, MDDB_MUSTEXIST, &enter_err);
    if (setp == NULL)
        return (-1);

    single_thread_start(setp);
    lb = setp->s_lbp;

    if (lb->lb_setno != setno) {
        single_thread_end(setp);
        mddb_setexit(setp);
        return (-1);
    }

    /* Only a devid style locator block has anything to validate */
    if (lb->lb_flags & MDDB_DEVID_STYLE) {
        dblk = setp->s_did_icp->did_ic_blkp;

        for (idx = 0; idx < lb->lb_loccnt; idx++) {
            mddb_did_info_t *dinfo = &dblk->blk_info[idx];
            ddi_devid_t stored;
            ddi_devid_t live = NULL;
            char        *mname;
            dev_t       dev;
            int     got;
            int     suflen;

            /* Skip locators without a devid ... */
            if (!(dinfo->info_flags & MDDB_DID_EXISTS))
                continue;
            /* ... and those already known to be good */
            if (dinfo->info_flags & MDDB_DID_VALID)
                continue;

            /* devid exists but has not been validated yet */
            dev = expldev(lb->lb_locators[idx].l_dev);
            got = mddb_devid_get(setp, idx, &stored, &mname);
            ASSERT(got == 1);

            if ((ddi_lyr_get_devid(dev, &live) != DDI_SUCCESS) ||
                (ddi_devid_compare(live, stored) != 0)) {
                invalid++;
                /*
                 * Future note: Need to do something here
                 * for the MN diskset case when device ids
                 * are supported in disksets.
                 * Can't add until merging devids_in_diskset
                 * code into code base.
                 */
                suflen =
                    setp->s_lnp->ln_suffixes[0][idx].suf_len;
                if (suflen > *rmaxsz)
                    *rmaxsz = suflen;
            } else {
                /* device still carries the recorded devid */
                dinfo->info_flags = MDDB_DID_VALID |
                    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
            }

            if (live != NULL)
                ddi_devid_free(live);
        }
    }

    /* A failed push overrides the count with the error return */
    if (push_lb(setp) != 0)
        invalid = -1;
    (void) upd_med(setp, "mddb_validate_lb(0)");
    single_thread_end(setp);
    mddb_setexit(setp);
    return (invalid);
}

/*
 * check_active_locators - report whether the local set has any locator
 *      flagged MDDB_F_ACTIVE.  The scan is done under mddb_lock.
 *  RETURN
 *      0   local set has no database or no locator block, or no
 *          locator is active (nothing here, so we can unload)
 *      1   at least one locator is active
 */
int
check_active_locators()
{
    mddb_set_t  *local;
    mddb_lb_t   *lb;
    int     idx;
    int     found = 0;

    mutex_enter(&mddb_lock);

    local = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
    if ((local != NULL) && ((lb = local->s_lbp) != NULL)) {
        for (idx = 0; idx < lb->lb_loccnt; idx++) {
            if (lb->lb_locators[idx].l_flags & MDDB_F_ACTIVE) {
                found = 1;
                break;
            }
        }
    }

    mutex_exit(&mddb_lock);
    return (found);
}

/*
 * regetoptrecord:
 * --------------
 *  Update the in-core optimized resync record contents by re-reading the
 *  record from the on-disk metadb.
 *  The contents of the resync record will be overwritten by calling this
 *  routine. This means that callers that require the previous contents to
 *  be preserved must save the data before calling this routine.
 *
 *  Both optimized-record locations are first flagged MDDB_F_EDATA; each
 *  location that is read back and verified (magic, revision, crc) is reset
 *  to MDDB_F_ACTIVE.  The first good copy lands in dep->de_rb; a second
 *  good copy is read into a scratch buffer only so that its checksum can be
 *  compared with the first.  If the two copies disagree the second location
 *  is flagged MDDB_F_EDATA again.
 *
 *  If no copy can be read, the in-core record is given a new timestamp and
 *  a regenerated crc before returning.
 *
 *  Return values:
 *  0 - successfully read in resync record from a mddb
 *  1 - failure.  Unable to read resync record from either mddb.
 */
static int
regetoptrecord(
    mddb_set_t  *s,
    mddb_de_ic_t    *dep
)
{
    mddb_lb_t   *lbp;
    mddb_locator_t  *lp;
    mddb_rb32_t *rbp, *crbp;
    int     li;
    int     i;
    int     err = 0;
    size_t      recsize;

#if defined(_ILP32) && !defined(lint)
    ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

    /* scratch buffer that receives the second copy for comparison */
    recsize = dep->de_recsize;
    crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);

    single_thread_start(s);

    /* the first good copy is read straight into the in-core record */
    rbp = dep->de_rb;

    /* assume both copies are bad until they have been read back */
    dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
    dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;

    lbp = s->s_lbp;

    for (i = 0; i < 2; i++) {
        if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
            continue;
        li = dep->de_optinfo[i].o_li;
        lp = &lbp->lb_locators[li];

        if (! (lp->l_flags & MDDB_F_ACTIVE) ||
            (lp->l_flags & MDDB_F_EMASTER))
            continue;

        /*
         * re-read the optimized resync record with failfast set
         * since a failed disk could lead to a very long wait.
         */
        err = readblklst(s, (caddr_t)rbp, dep->de_blks,
            dep->de_blkcount, li, B_FAILFAST);

        if (err)
            continue;

        if (rbp->rb_magic != MDDB_MAGIC_RB)
            continue;

        if (revchk(MDDB_REV_RB, rbp->rb_revision))
            continue;

        /* Check the crc for this record */
        if (rec_crcchk(s, dep, rbp)) {
            continue;
        }
        dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;

        if (rbp == crbp) {
            /*
             * This is the second good copy.  Compare it with the
             * first copy held in dep->de_rb; rbp and crbp are the
             * same buffer here, so comparing those two against
             * each other could never detect a mismatch.
             */
            if (dep->de_rb->rb_checksum != crbp->rb_checksum)
                dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
            break;
        }

        /* first good copy is in place; read any further copy aside */
        rbp = crbp;
    }

    single_thread_end(s);

    /* rbp only moves to the scratch buffer after one good read */
    if (rbp == crbp) {
        /*
         * NOTE(review): this clears rb_private in the scratch buffer,
         * which is freed on the next line, so the store has no lasting
         * effect.  It may have been meant for dep->de_rb - confirm
         * before changing.
         */
        rbp->rb_private = 0;
        kmem_free((caddr_t)crbp, recsize);
        return (0);
    }

    /* Nothing could be read: restamp the in-core record */
    uniqtime32(&rbp->rb_timestamp);
    /* Generate the crc for this record */
    rec_crcgen(s, dep, rbp);
    kmem_free((caddr_t)crbp, recsize);
    return (1);
}

/*
 * mddb_reread_rr:
 *  Re-read the resync record from the on-disk copy. This is required for
 *  multi-node support so that a new mirror-owner can determine if a resync
 *  operation is required to guarantee data integrity.
 *
 * Arguments:
 *  setno   Associated set
 *  id  Resync record ID
 *
 * Return Value:
 *  0   successful reread
 *  -1  invalid set (not multi-node or non-existent), or no record
 *      with the given ID exists in the set
 *  >0  metadb state invalid, failed to reread
 */
int
mddb_reread_rr(
    set_t       setno,
    mddb_recid_t    id
)
{
    mddb_set_t  *s;
    int     err = 0;
    mddb_db_t   *dbp;
    /*
     * Must start out NULL: when the set has no directory blocks the
     * search loop below never assigns dep, yet it is tested afterwards.
     */
    mddb_de_ic_t    *dep = NULL;

    if (setno >= md_nsets)
        return (-1);

    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
        return (-1);

    /* Only multi-node disksets are supported */
    if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
        mddb_setexit(s);
        return (-1);
    }

    /* Search every directory block for the entry carrying this recid */
    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep != NULL;
            dep = dep->de_next) {
            if (dep->de_recid == DBID(id))
                break;
        }
        if (dep != NULL)
            break;
    }

    if (dep != NULL) {
        err = regetoptrecord(s, dep);
    } else {
        err = -1;
    }
    mddb_setexit(s);
    return (err);
}

/*
 * Set owner associated with MN optimized resync record.
 *
 * Optimized records have an owner node associated with them in
 * a MN diskset.  The owner is only set on a node that is actively
 * writing to that record.  The other nodes will show that record
 * as having an invalid owner.  The owner for an optimized record
 * is used during fixoptrecord to determine which node should
 * write out the record when the replicas associated with that
 * optimized record have been changed.
 *
 * Called directly from mirror driver and not from an ioctl.
 *
 * Arguments:
 *  id      record ID; the set is taken from DBSET(id)
 *  owner   node ID to store in the record's directory entry
 *
 * Returns
 *  0 if successful.
 *  MDDB_E_NORECORD if the set number is out of range, the set
 *  cannot be entered, or the record is not found.
 */
int
mddb_setowner(
    mddb_recid_t        id,
    md_mn_nodeid_t      owner
)
{
    mddb_set_t      *s;
    mddb_db_t       *dbp;
    mddb_de_ic_t        *dep;
    int         found = 0;

    if (DBSET(id) >= md_nsets)
        return (MDDB_E_NORECORD);

    if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
        return (MDDB_E_NORECORD);

    /* directory entries hold the record ID without the set part */
    id = DBID(id);
    for (dbp = s->s_dbp; (dbp != NULL) && !found; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry;
            dep != NULL; dep = dep->de_next) {
            if (dep->de_recid != id)
                continue;
            dep->de_owner_nodeid = owner;
            found = 1;
            break;
        }
    }

    mddb_setexit(s);

    /* the function returns an int status, so use 0 rather than NULL */
    return (found ? 0 : MDDB_E_NORECORD);
}

/*
 * mddb_parse re-reads portions of the mddb from disk given a list
 * of good replicas to read from and flags describing
 * which portion of the mddb to read in.
 *
 * Used in a MN diskset when the master has made a change to some part
 * of the mddb and wants to relay this information to the slaves.
 */
int
mddb_parse(mddb_parse_parm_t *mpp)
{
    mddb_set_t  *s;
    int     err = 0;
    mddb_locator_t  *lp, *old_lp;
    mddb_lb_t   *lbp, *old_lbp;
    int     rval = 0;
    int     i, li;
    int     found_good_one = 0;
    mddb_ln_t   *lnp;
    mddb_block_t    ln_blkcnt;
    md_error_t  *ep = &mpp->c_mde;

    if (mpp->c_setno >= md_nsets)
        return (EINVAL);

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
        return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
    }

    if (!(MD_MNSET_SETNO(mpp->c_setno))) {
        mddb_setexit_no_parse(s);
        return (EINVAL);
    }

    /*
     * Master node initiated this request, so there's no work for
     * the master node to do.
     */
    if (md_set[mpp->c_setno].s_am_i_master) {
        mddb_setexit_no_parse(s);
        return (rval);
    }

    single_thread_start(s);

    if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
        lbp = 0;
        for (i = 0; i < MDDB_NLB; i++) {
            /* Walk through master's active list */
            if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
                continue;
            if (s->s_mbiarray[i] == NULL)
                continue;

            /* Assumes master blocks are already setup */
            if (lbp == (mddb_lb_t *)NULL) {
                lbp = (mddb_lb_t *)kmem_zalloc(
                    dbtob(MDDB_MNLBCNT), KM_SLEEP);
            }
            err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);

            if (err)
                continue;

            if (lbp->lb_magic != MDDB_MAGIC_LB)
                continue;
            if (lbp->lb_blkcnt != MDDB_MNLBCNT)
                continue;
            if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
                continue;
            if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
                NULL))
                continue;
            if (lbp->lb_setno != s->s_setno)
                continue;
            /*
             * a commit count of zero means this locator has
             * been deleted
             */
            if (lbp->lb_commitcnt == 0) {
                continue;
            }
            /* Found a good locator - keep it */
            found_good_one = 1;
            break;
        }

        /*
         * If found a good copy of the mddb, then read it into
         * this node's locator block.  Fix up the set's s_mbiarray
         * pointer (master block incore array pointer) to be
         * in sync with the newly read in locator block.  If a
         * new mddb was added, read in the master blocks associated
         * with the new mddb.  If an mddb was deleted, free the
         * master blocks associated with deleted mddb.
         */
        if (found_good_one)  {
            /* Compare old and new view of mddb locator blocks */
            old_lbp = s->s_lbp;
            for (li = 0; li < lbp->lb_loccnt; li++) {
                int mn_set;

                lp = &lbp->lb_locators[li];
                old_lp = &old_lbp->lb_locators[li];

                /* If old and new views match, continue */
                if ((lp->l_flags & MDDB_F_ACTIVE) ==
                    (old_lp->l_flags & MDDB_F_ACTIVE))
                    continue;

                if (lp->l_flags & MDDB_F_ACTIVE) {
                    /*
                     * If new mddb has been added - delete
                     * old mbiarray and get new one.
                     *
                     * When devids are supported, will
                     * need to get dev from devid.
                     */
                    if (s->s_mbiarray[li]) {
                        free_mbipp(&s->s_mbiarray[li]);
                    }
                    /*
                     * If getmasters fails, getmasters
                     * will set appropriate error flags.
                     */
                    s->s_mbiarray[li] = getmasters(s,
                        md_expldev(lp->l_dev), lp->l_blkno,
                        (uint_t *)&(lp->l_flags), &mn_set);
                } else if (lp->l_flags & MDDB_F_DELETED) {
                    /*
                     * If old one has been deleted -
                     * delete old mbiarray.
                     */
                    if (s->s_mbiarray[li]) {
                        free_mbipp(&s->s_mbiarray[li]);
                    }
                }
            }

            /* Free this node's old view of mddb locator blocks */
            kmem_free((caddr_t)s->s_lbp,
                dbtob(s->s_lbp->lb_blkcnt));
            s->s_lbp = lbp;
        } else {
            if (lbp)
                kmem_free(lbp, dbtob(MDDB_MNLBCNT));
        }
    }

    if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
        lnp = s->s_lnp;
        lbp = s->s_lbp;
        ln_blkcnt = lbp->lb_lnblkcnt;
        s->s_lnp = NULL; /* readlocnames does this anyway */
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];

            if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
                (lp->l_flags & MDDB_F_EMASTER))
                continue;

            /* Successfully read the locator names */
            if (readlocnames(s, li) == 0)
                break;
        }

        if (li == lbp->lb_loccnt) {
            /* Did not successfully read locnames; restore lnp */
            s->s_lnp = lnp;
        } else {
            /* readlocnames successful, free old struct */
            kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
        }
    }

    if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
        mddb_de_ic_t    *dep, *tdep, *first_dep, *dep2;
        mddb_db_t   *dbp;
        mddb_db32_t *db32p;
        mddb_de32_t *de32p, *de32p2;
        int     writeout;

        lbp = s->s_lbp;
        /*
         * Walk through directory block and directory entry incore
         * linked list looking for optimized resync records.
         * For each opt record found, re-read in directory block.
         * The directoy block consists of a number of directory
         * entries.  The directory entry for this opt record will
         * describe which 2 mddbs actually contain the resync record
         * since it could have been relocated by the master node
         * due to mddb failure or mddb deletion.  If this node
         * is the record owner for this opt record, then write out
         * the record to the 2 mddbs listed in the directory entry
         * if the mddbs locations are different than previously known.
         */
        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
            for (dep = dbp->db_firstentry; dep;
                dep = dep->de_next) {
                /* Found an opt record */
                if (dep->de_flags & MDDB_F_OPT)
                    break;
            }
            /* If no opt records found, go to next dbp */
            if (dep == NULL)
                continue;

            /*
             * Reread directory block from disk since
             * master could have rewritten in during fixoptrecord.
             */
            db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
                KM_SLEEP);
            create_db32rec(db32p, dbp);
            for (li = 0; li < lbp->lb_loccnt; li++) {
                lp = &lbp->lb_locators[li];

                if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
                    (lp->l_flags & MDDB_F_EMASTER))
                    continue;

                err = readblks(s, (caddr_t)db32p,
                    db32p->db32_blknum, 1, li);
                if (err)
                    continue;

                /* Reverify db; go to next mddb if bad */
                if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
                    (revchk(MDDB_REV_DB,
                    db32p->db32_revision)) ||
                    (crcchk(db32p, &db32p->db32_checksum,
                    MDDB_BSIZE, NULL))) {
                    continue;
                } else {
                    break;
                }
            }
            /*
             * If all mddbs are unavailable then panic since
             * this slave cannot be allowed to continue out-of-sync
             * with the master node.  Since the optimized resync
             * records are written by all nodes, all nodes must
             * stay in sync with the master.
             *
             * This also handles the case when all storage
             * connectivity to a slave node has failed.  The
             * slave node will send an MDDB_OPTRECERR message to
             * the master node when the slave node has been unable
             * to write an optimized resync record to both
             * designated mddbs.  After the master has fixed the
             * optimized records to be on available mddbs, the
             * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
             * is sent to all slave nodes.  If a slave node is
             * unable to access any mddb in order to read in the
             * relocated optimized resync record, then the slave
             * node must panic.
             */
            if (li == lbp->lb_loccnt) {
                kmem_free((caddr_t)db32p, MDDB_BSIZE);
                cmn_err(CE_PANIC, "md: mddb: Node unable to "
                    "access any SVM state database "
                    "replicas for diskset %s\n", s->s_setname);
            }
            /*
             * Setup temp copy of linked list of de's.
             * Already have an incore copy, but need to walk
             * the directory entry list contained in the
             * new directory block that was just read in above.
             * After finding the directory entry of an opt record
             * by walking the incore list, find the corresponding
             * entry in the temporary list and then update
             * the incore directory entry record with
             * the (possibly changed) mddb location stored
             * for the optimized resync records.
             */
            de32p = (mddb_de32_t *)
                ((void *) ((caddr_t)
                (&db32p->db32_firstentry)
                + sizeof (db32p->db32_firstentry)));
            tdep = (mddb_de_ic_t *)
                kmem_zalloc(sizeof (mddb_de_ic_t) -
                sizeof (mddb_block_t) +
                sizeof (mddb_block_t) *
                de32p->de32_blkcount, KM_SLEEP);
            de32tode(de32p, tdep);
            first_dep = tdep;
            while (de32p && de32p->de32_next) {
                de32p2 = nextentry(de32p);
                dep2 = (mddb_de_ic_t *)kmem_zalloc(
                    sizeof (mddb_de_ic_t) -
                    sizeof (mddb_block_t) +
                    sizeof (mddb_block_t) *
                    de32p2->de32_blkcount, KM_SLEEP);
                de32tode(de32p2, dep2);
                tdep->de_next = dep2;
                tdep = dep2;
                de32p = de32p2;
            }

            /* Now, walk the incore directory entry list */
            for (dep = dbp->db_firstentry; dep;
                dep = dep->de_next) {
                if (! (dep->de_flags & MDDB_F_OPT))
                    continue;
                /*
                 * Found an opt record in the incore copy.
                 * Find the corresponding entry in the temp
                 * list.  If anything has changed in the
                 * opt record info between the incore copy
                 * and the temp copy, update the incore copy
                 * and set a flag to writeout the opt record
                 * to the new mddb locations.
                 */
                for (tdep = first_dep; tdep;
                    tdep = tdep->de_next) {
                    if (dep->de_recid == tdep->de_recid) {
                        writeout = 0;
                        /* Check first mddb location */
                        if ((dep->de_optinfo[0].o_li !=
                            tdep->de_optinfo[0].o_li) ||
                            (dep->de_optinfo[0].
                            o_flags != tdep->de_optinfo
                            [0].o_flags)) {
                            dep->de_optinfo[0] =
                                tdep->de_optinfo[0];
                            writeout = 1;
                        }
                        /* Check second mddb location */
                        if ((dep->de_optinfo[1].o_li !=
                            tdep->de_optinfo[1].o_li) ||
                            (dep->de_optinfo[1].
                            o_flags != tdep->de_optinfo
                            [1].o_flags)) {
                            dep->de_optinfo[1] =
                                tdep->de_optinfo[1];
                            writeout = 1;
                        }
                        /*
                         * Record owner should rewrite
                         * it
                         */
                        if ((writeout) &&
                            (dep->de_owner_nodeid ==
                            md_set[mpp->c_setno].
                            s_nodeid))
                            (void) writeoptrecord(s,
                                dep);
                        break;
                    }
                }
            }
            /*
             * Update the incore checksum information for this
             * directory block to match the newly read in checksum.
             * This should have only changed if the incore and
             * temp directory entries differed, but it takes
             * more code to do the check than to just update
             * the information everytime.
             */
            dbp->db_checksum = db32p->db32_checksum;

            /* Now free everything */
            tdep = first_dep;
            while (tdep) {
                dep2 = tdep->de_next;
                kmem_free((caddr_t)tdep,
                    sizeofde(tdep));
                tdep = dep2;
            }
            kmem_free((caddr_t)db32p, MDDB_BSIZE);
        }
        rval = 0;
    }
out:
    single_thread_end(s);
    mddb_setexit_no_parse(s);
    return (rval);
}

/*
 * mddb_block sets or clears the MD_SET_MNPARSE_BLK status of a MN diskset
 * according to mbp->c_blk_flags: MDDB_BLOCK_PARSE sets it and
 * MDDB_UNBLOCK_PARSE clears it.  When both flags are given the set is
 * applied first and the clear second.
 *
 * Return values:
 *  0       status updated, or nothing to do (a reconfig cycle with
 *          a new master is in progress, or the local set could
 *          not be snarfed)
 *  EINVAL      set number out of range or set is not a MN diskset
 *  other       value from mddbstatus2error() when the set cannot
 *          be entered
 */
int
mddb_block(mddb_block_parm_t *mbp)
{
    md_error_t  *mdep = &mbp->c_mde;
    mddb_set_t  *setp;
    int     status = 0;

    if (mbp->c_setno >= md_nsets)
        return (EINVAL);

    /*
     * While the new_master flag is set for this setno a reconfig cycle
     * is under way and neither blocking nor unblocking is needed, so
     * report success straight away.
     */
    if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC)
        return (0);

    if (md_snarf_db_set(MD_LOCAL_SET, mdep) != 0)
        return (0);

    setp = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &status);
    if (setp == NULL)
        return (mddbstatus2error(mdep, status, NODEV32, mbp->c_setno));

    if (!(MD_MNSET_SETNO(mbp->c_setno))) {
        mddb_setexit_no_parse(setp);
        return (EINVAL);
    }

    single_thread_start(setp);

    /* order matters when both flags are present: set, then clear */
    if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
        md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
    if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
        md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);

    single_thread_end(setp);

    mddb_setexit_no_parse(setp);
    return (status);
}

/*
 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
 * to relocate any optimized resync records to available mddbs.
 * This routine is only called on the master node.
 *
 * Used in a MN diskset when a slave node has failed to write an optimized
 * resync record.  The failed mddb information is sent to the master node
 * so the master can relocate the optimized records, if possible.  If the
 * failed mddb information has a mddb marked as failed that was previously
 * marked active on the master, the master sets its incore mddb state to
 * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
 * to relocate any optimized records on the newly failed mddbs by calling
 * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
 * optimized records are relocated.)
 *
 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
 * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
 * flag causes the slave node to re-read in the locator block from disk.
 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
 * blocks and write out any optimized resync records that have been
 * relocated to a different mddb.
 */
int
mddb_optrecfix(mddb_optrec_parm_t *mop)
{
    mddb_set_t      *s;
    int         err = 0;
    mddb_lb_t       *lbp;
    mddb_mnlb_t     *mnlbp;
    mddb_locator_t      *lp;
    int         li;
    mddb_mnsidelocator_t    *mnslp;
    mddb_drvnm_t        *dn;
    int         i, j;
    md_replica_recerr_t *recerr;
    md_error_t      *ep = &mop->c_mde;
    int         something_changed = 0;
    int         alc, lc;
    int         setno;

    setno = mop->c_setno;
    if (mop->c_setno >= md_nsets)
        return (EINVAL);

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
        return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
    }

    if (!(MD_MNSET_SETNO(mop->c_setno))) {
        mddb_setexit(s);
        return (EINVAL);
    }

    single_thread_start(s);
    lbp = s->s_lbp;
    mnlbp = (mddb_mnlb_t *)lbp;

    /*
     * If slave node has seen an mddb failure, but the master node
     * hasn't encountered this failure, mark the mddb as failed on
     * the master node and set the something_changed flag to 1.
     */
    for (i = 0; i < 2; i++) {
        recerr = &mop->c_recerr[i];
        if (recerr->r_flags & MDDB_F_EWRITE) {
            li = recerr->r_li;
            lp = &lbp->lb_locators[li];
            for (j = 0; j < MD_MNMAXSIDES; j++) {
                mnslp = &mnlbp->lb_mnsidelocators[j][li];
                if (mnslp->mnl_sideno == s->s_sideno)
                    break;
            }
            /* Do quick check using li */
            if (j != MD_MNMAXSIDES)
                dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];

            if ((j != MD_MNMAXSIDES) &&
                (strncmp(dn->dn_data, recerr->r_driver_name,
                MD_MAXDRVNM) == 0) &&
                (recerr->r_blkno == lp->l_blkno) &&
                (recerr->r_mnum == mnslp->mnl_mnum)) {
                if ((lp->l_flags & MDDB_F_ACTIVE) ||
                    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
                    something_changed = 1;
                    lp->l_flags |= MDDB_F_EWRITE;
                    lp->l_flags &= ~MDDB_F_ACTIVE;
                }
            } else {
                /*
                 * Passed in li from slave does not match
                 * the replica in the master's structures.
                 * This could have occurred if a delete
                 * mddb command was running when the
                 * optimized resync record had a failure.
                 * Search all replicas for this entry.
                 * If no match, just ignore.
                 * If a match, set replica in error.
                 */
                for (li = 0; li < lbp->lb_loccnt; li++) {
                    lp = &lbp->lb_locators[li];
                    if (lp->l_flags & MDDB_F_DELETED)
                        continue;

                    for (j = 0; j < MD_MNMAXSIDES; j++) {
                        mnslp =
                            &mnlbp->
                            lb_mnsidelocators[j][li];
                        if (mnslp->mnl_sideno ==
                            s->s_sideno)
                            break;
                    }
                    if (j == MD_MNMAXSIDES)
                        continue;

                    dn = &lbp->
                        lb_drvnm[mnslp->mnl_drvnm_index];
                    if ((strncmp(dn->dn_data,
                        recerr->r_driver_name,
                        MD_MAXDRVNM) == 0) &&
                        (recerr->r_blkno == lp->l_blkno) &&
                        (recerr->r_mnum ==
                        mnslp->mnl_mnum)) {
                        if ((lp->l_flags &
                            MDDB_F_ACTIVE) ||
                            ((lp->l_flags &
                            MDDB_F_EWRITE) == 0)) {
                            something_changed = 1;
                            lp->l_flags |=
                                MDDB_F_EWRITE;
                            lp->l_flags &=
                                ~MDDB_F_ACTIVE;
                        }
                        break;
                    }
                }
            }
        }
    }

    /*
     * If this message changed nothing, then we're done since this
     * failure has already been handled.
     * If some mddb state has been changed, send a parse message to
     * the slave nodes so that the slaves will re-read the locator
     * block from disk.
     */
    if (something_changed == 0) {
        single_thread_end(s);
        mddb_setexit(s);
        return (0);
    } else {
        s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
    }

    /*
     * Scan replicas setting MD_SET_TOOFEW if
     * 50% or more of the mddbs have seen errors.
     * Note: Don't call selectreplicas or writeretry
     * since these routines may end up setting the ACTIVE flag
     * on a failed mddb if the master is able to access the mddb
     * but the slave node couldn't.  Need to have the ACTIVE flag
     * turned off in order to relocate the optimized records to
     * mddbs that are (hopefully) available on all nodes.
     */
    alc = 0;
    lc = 0;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        lc++;
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        alc++;
    }

    /*
     * If more than 50% mddbs have failed, then don't relocate opt recs.
     * The node sending the mddb failure information will detect TOOFEW
     * and will panic when it attempts to re-write the optimized record.
     */
    if (alc < ((lc + 1) / 2)) {
        md_set_setstatus(setno, MD_SET_TOOFEW);
        (void) push_lb(s);
        (void) upd_med(s, "mddb_optrecfix(0)");
        single_thread_end(s);
        mddb_setexit(s);
        return (0);
    }

    /* Attempt to relocate optimized records that are on failed mddbs */
    (void) fixoptrecords(s);

    /* Push changed locator block out to disk */
    (void) push_lb(s);
    (void) upd_med(s, "mddb_optrecfix(1)");

    /* Recheck for TOOFEW after writing out locator blocks */
    alc = 0;
    lc = 0;
    for (li = 0; li < lbp->lb_loccnt; li++) {
        lp = &lbp->lb_locators[li];
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        lc++;
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        alc++;
    }

    /* If more than 50% mddbs have failed, then don't relocate opt recs */
    if (alc < ((lc + 1) / 2)) {
        md_set_setstatus(setno, MD_SET_TOOFEW);
        single_thread_end(s);
        mddb_setexit(s);
        return (0);
    }

    single_thread_end(s);
    mddb_setexit(s);
    return (0);
}

/*
 * mddb_check_write_ioctl
 *
 * Check if incore mddb on master node matches ondisk mddb.
 * If not, master writes out incore view to all mddbs.
 * Have previously verified that master is an owner of the
 * diskset (master has snarfed diskset) and that diskset is
 * not stale.
 *
 * Meant to be called during reconfig cycle during change of master.
 * Previous master in diskset may have changed the mddb and
 * panic'd before relaying information to slave nodes.  New
 * master node just writes out its incore view of the mddb and
 * the replay of the change log will resync all the nodes.
 *
 * Only supported for MN disksets.
 *
 * Parameters:
 *  info - ioctl argument; c_setno selects the set and c_mde receives
 *         the error description for the failures reported through it.
 *
 * Return values:
 *  0 - success (also returned when the local set cannot be snarfed and
 *      when replica read/write errors were handed to selectreplicas)
 *  EINVAL - setno out of range or set is not a MN diskset
 *  other non-zero - value produced by mddbstatus2error/mdmddberror
 *
 * Panics if, after rewriting, fewer than 50% of the non-deleted
 * replicas are still usable.
 */
int
mddb_check_write_ioctl(mddb_config_t *info)
{
    int         err = 0;
    set_t           setno = info->c_setno;
    mddb_set_t      *s;
    int         li;
    mddb_locator_t      *lp;
    mddb_lb_t       *lbp;
    mddb_mnlb_t     *mnlbp_od;
    mddb_ln_t       *lnp;
    mddb_mnln_t     *mnlnp_od;
    mddb_db_t       *dbp;
    mddb_de_ic_t        *dep;
    /*
     * Must start out as 0: the scan below can finish without ever
     * assigning it (no locators, all locators deleted/inactive, or
     * every read failing), and it is tested right after the scan.
     */
    int         write_out_mddb = 0;
    md_error_t      *ep = &info->c_mde;
    int         mddb_err = 0;
    int         prev_li = 0;
    int         alc, lc;
    int         mddbs_present = 0;

    /* Verify that setno is in valid range */
    if (setno >= md_nsets)
        return (EINVAL);

    if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
        return (0);

    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
        return (mddbstatus2error(ep, err, NODEV32, setno));
    }

    /* Calling diskset must be a MN diskset */
    if (!(MD_MNSET_SETNO(setno))) {
        mddb_setexit(s);
        return (EINVAL);
    }

    /* Re-verify that set is not stale */
    if (md_get_setstatus(setno) & MD_SET_STALE) {
        mddb_setexit(s);
        return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
    }

    lbp = s->s_lbp;
    lnp = s->s_lnp;

    /*
     * Previous master could have died during the write of data to
     * the mddbs so that the ondisk mddbs may not be consistent.
     * So, need to check the contents of the first and last active mddb
     * to see if the mddbs need to be rewritten.
     */
    for (li = 0; li < lbp->lb_loccnt; li++) {
        int checkcopy_err;

        lp = &lbp->lb_locators[li];
        /* Find replica that is active */
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        mddbs_present = 1;
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        if (s->s_mbiarray[li] == NULL)
            continue;
        /* Check locator block */
        mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
            KM_SLEEP);
        /* read in on-disk locator block */
        err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);

        /* If err, try next mddb */
        if (err) {
            kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
            continue;
        }

        /*
         * We resnarf all changelog entries for this set.
         * They may have been altered by the previous master
         */
        for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
            for (dep = dbp->db_firstentry; dep; dep =
                dep->de_next) {
                if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
                    continue;
                }
                /*
                 * This has been alloc'ed while
                 * joining the set
                 */
                if (dep->de_rb) {
                    kmem_free(dep->de_rb, dep->de_recsize);
                    dep->de_rb = (mddb_rb32_t *)NULL;
                }
                if (dep->de_rb_userdata) {
                    kmem_free(dep->de_rb_userdata,
                        dep->de_reqsize);
                    dep->de_rb_userdata = (caddr_t)NULL;
                }

                err = getrecord(s, dep, li);
                if (err) {
                    /*
                     * When we see an error while reading
                     * the changelog entries, we move on
                     * to the next mddb
                     */
                    err = 1;
                    break; /* out of inner for-loop */
                }
                allocuserdata(dep);
            }
            if (err)
                break; /* out of outer for-loop */
        }

        /* If err, try next mddb */
        if (err) {
            kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
            continue;
        }

        /*
         * Is incore locator block same as ondisk?
         * Any non-zero bcmp result means the buffers differ.
         */
        if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
            != 0) {
            write_out_mddb = 1;
            kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
            break;
        }

        kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));

        /* If lb ok, check locator names */
        mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
            KM_SLEEP);
        /* read in on-disk locator names */
        err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
            lbp->lb_lnblkcnt, li);

        /* If err, try next mddb */
        if (err) {
            kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
            continue;
        }

        /* Are incore locator names same as ondisk? */
        if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
            != 0) {
            kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
            write_out_mddb = 1;
            break;
        }

        kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));

        /*
         * Check records in mddb.
         * If a read error is encountered, set the error flag and
         * continue to the next mddb.  Otherwise, if incore data is
         * different from ondisk, then set the flag to write out
         * the mddb and break out.
         */
        checkcopy_err = checkcopy(s, li);
        if (checkcopy_err == MDDB_F_EREAD) {
            lp->l_flags |= MDDB_F_EREAD;
            mddb_err = 1;
            continue;
        } else if (checkcopy_err == 1) {
            write_out_mddb = 1;
            break;
        }
        /*
         * Have found first active mddb and the data is the same as
         * incore - break out of loop
         */
        write_out_mddb = 0;
        break;
    }

    /*
     * Skip checking for last active mddb if:
     *  - already found a mismatch in the first active mddb
     *      (write_out_mddb is 1)  OR
     *  - didn't find a readable mddb when looking for first
     *    active mddb (there are mddbs present but all failed
     *    when read was attempted).
     *
     * In either case, go to write_out_mddb label in order to attempt
     * to write out the data. If < 50% mddbs are available, panic.
     */
    if ((write_out_mddb == 1) ||
        ((li == lbp->lb_loccnt) && mddbs_present)) {
        write_out_mddb = 1;
        goto write_out_mddb;
    }

    /*
     * Save which index was checked for the first active mddb.  If only 1
     * active mddb, don't want to recheck the same mddb when looking for
     * last active mddb.
     */
    prev_li = li;

    /*
     * Now, checking for last active mddb.  If found same index as before
     * (only 1 active mddb), then skip.
     */
    for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
        int checkcopy_err;

        lp = &lbp->lb_locators[li];
        /* Find replica that is active */
        if (! (lp->l_flags & MDDB_F_ACTIVE))
            continue;
        if (lp->l_flags & MDDB_F_DELETED)
            continue;
        if (s->s_mbiarray[li] == NULL)
            continue;
        /* If already checked mddb, bail out */
        if (li == prev_li)
            break;
        /* Check locator block */
        mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
            KM_SLEEP);
        /* read in on-disk locator block */
        err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);

        /* If err, try next mddb */
        if (err) {
            kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
            continue;
        }


        /* Is incore locator block same as ondisk? */
        if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
            != 0) {
            kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
            write_out_mddb = 1;
            break;
        }

        kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));

        /* If lb ok, check locator names */
        mnlnp_od = (mddb_mnln_t *)
            kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);

        /* read in on-disk locator names */
        err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
            lbp->lb_lnblkcnt, li);

        /* If err, try next mddb */
        if (err) {
            kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
            continue;
        }

        /* Are incore locator names same as ondisk? */
        if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
            != 0) {
            kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
            write_out_mddb = 1;
            break;
        }

        kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));

        /*
         * Check records in mddb.
         * If a read error is encountered, set the error flag and
         * continue to the next mddb.  Otherwise, if incore data is
         * different from ondisk, then set the flag to write out
         * the mddb and break out.
         */
        checkcopy_err = checkcopy(s, li);
        if (checkcopy_err == MDDB_F_EREAD) {
            lp->l_flags |= MDDB_F_EREAD;
            mddb_err = 1;
            continue;
        } else if (checkcopy_err == 1) {
            write_out_mddb = 1;
            break;
        }
        /*
         * Have found last active mddb and the data is the same as
         * incore - break out of loop
         */
        write_out_mddb = 0;
        break;
    }

    /*
     * If ondisk and incore versions of the mddb don't match, then
     * write out this node's incore version to disk.
     * Or, if unable to read a copy of the mddb, attempt to write
     * out a new one.
     */
write_out_mddb:
    if (write_out_mddb) {
        /* Recompute free blocks based on incore information */
        computefreeblks(s); /* set up free block bits */

        /*
         * Write directory entries and record blocks.
         * Use flag MDDB_WRITECOPY_SYNC so that writecopy
         * routine won't write out change log records.
         */
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            /* Don't write to inactive or deleted mddbs */
            if (! (lp->l_flags & MDDB_F_ACTIVE))
                continue;
            if (lp->l_flags & MDDB_F_DELETED)
                continue;
            if (s->s_mbiarray[li] == NULL)
                continue;
            /* If encounter a write error, save it for later */
            if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
                lp->l_flags |= MDDB_F_EWRITE;
                mddb_err = 1;
            }
        }

        /*
         * Write out locator blocks to all replicas.
         * push_lb will set MDDB_F_EWRITE on replicas that fail.
         */
        if (push_lb(s))
            mddb_err = 1;
        (void) upd_med(s, "mddb_check_write_ioctl(0)");

        /* Write out locator names to all replicas */
        lnp = s->s_lnp;
        uniqtime32(&lnp->ln_timestamp);
        lnp->ln_revision = MDDB_REV_MNLN;
        crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);

        /* writeall sets MDDB_F_EWRITE if writes fails to replica */
        if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
            lbp->lb_lnblkcnt, 0))
            mddb_err = 1;

        /*
         * The writes to the replicas above would have set
         * the MDDB_F_EWRITE flags if any write error was
         * encountered.
         * If < 50% of the mddbs are available, panic.
         */
        lc = alc = 0;
        for (li = 0; li < lbp->lb_loccnt; li++) {
            lp = &lbp->lb_locators[li];
            if (lp->l_flags & MDDB_F_DELETED)
                continue;
            lc++;
            /*
             * If mddb:
             *  - is not active (previously had an error)
             *  - had an error reading the master blocks  or
             *  - had an error in writing to the mddb
             * then don't count this mddb in the active count.
             */
            if (! (lp->l_flags & MDDB_F_ACTIVE) ||
                (lp->l_flags & MDDB_F_EMASTER) ||
                (lp->l_flags & MDDB_F_EWRITE))
                continue;
            alc++;
        }
        if (alc < ((lc + 1) / 2)) {
            cmn_err(CE_PANIC,
                "md: Panic due to lack of DiskSuite state\n"
                " database replicas. Fewer than 50%% of "
                "the total were available,\n so panic to "
                "ensure data integrity.");
        }
    }

    /*
     * If encountered an error during checking or writing of
     * mddbs, call selectreplicas so that replica error can
     * be properly handled. This will involve another attempt
     * to write the mddb out to any mddb marked MDDB_F_EWRITE.
     * If mddb still fails, it will have the MDDB_F_ACTIVE bit
     * turned off. Set the MDDB_SCANALLSYNC flag so that
     * selectreplicas doesn't overwrite the change log entries.
     *
     * Set the PARSE_LOCBLK flag in the mddb_set structure to show
     * that the locator block has been changed.
     */
    if (mddb_err) {
        (void) selectreplicas(s, MDDB_SCANALLSYNC);
        s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
    }

    mddb_setexit(s);
    return (0);
}

/*
 * mddb_setflags_ioctl
 *
 * Set, clear or report the reconfig-cycle status bits
 * (MD_SET_MN_NEWMAS_RC, MD_SET_MN_START_RC, MD_SET_MN_MIR_STATE_RC)
 * of a set.  Used during the reconfig cycle of MN disksets.
 *
 * info->sf_flags selects the operation (MDDB_NM_SET, MDDB_NM_RESET or
 * MDDB_NM_GET); any other value is ignored and still reports success.
 * For MDDB_NM_GET the result is stored back into info->sf_setflags.
 *
 * Return values:
 *  0 - success
 *  EINVAL - setno out of range or bad magic in the request
 */
int
mddb_setflags_ioctl(mddb_setflags_config_t *info)
{
    set_t           setno = info->sf_setno;

    /*
     * The set may not be snarfed yet when flags are being set, so
     * neither SNARFED nor MN-ness is checked and mddb_setenter is not
     * called.  The range check and the magic number are the only
     * protection against a bogus ioctl.
     */
    if (setno >= md_nsets || info->sf_magic != MDDB_SETFLAGS_MAGIC)
        return (EINVAL);

    if (info->sf_flags == MDDB_NM_SET) {
        /* Turn on each requested reconfig-cycle bit */
        if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
            md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
        if (info->sf_setflags & MD_SET_MN_START_RC)
            md_set_setstatus(setno, MD_SET_MN_START_RC);
        if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
            md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
    } else if (info->sf_flags == MDDB_NM_RESET) {
        /* Turn off each requested reconfig-cycle bit */
        if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
            md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
        if (info->sf_setflags & MD_SET_MN_START_RC)
            md_clr_setstatus(setno, MD_SET_MN_START_RC);
        if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
            md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
    } else if (info->sf_flags == MDDB_NM_GET) {
        /* Report only the reconfig-cycle bits of the set status */
        info->sf_setflags = md_get_setstatus(setno) &
            (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
            MD_SET_MN_MIR_STATE_RC);
    }

    return (0);
}

/*
 * md_update_minor
 *
 * Update the minor number stored in the namespace entry of an
 * underlying metadevice.  Called from mod_imp_set where mod is sp,
 * stripe, mirror and raid.
 *
 * Every side is searched for 'key'.  An entry belonging to 'side' has
 * its n_minor rebuilt for 'setno' when its driver name is "md"; an
 * entry belonging to any other side is removed.
 *
 * Returns 1 on success, 0 if the namespace could not be loaded or has
 * no first record.
 */
int
md_update_minor(
    set_t   setno,
    side_t  side,
    mdkey_t key
)
{
    struct nm_next_hdr  *nh;
    struct nm_name      *entry;
    char            *drvname;
    side_t          cur_side;

    /* The devid namespace is optional; the regular one is required */
    (void) md_load_namespace(setno, NULL, NM_DEVID);
    if (! md_load_namespace(setno, NULL, 0L)) {
        /* Back out the devid namespace load */
        (void) md_unload_namespace(setno, NM_DEVID);
        return (0);
    }

    rw_enter(&nm_lock.lock, RW_READER);

    nh = get_first_record(setno, 0, NM_NOTSHARED);
    if (nh == NULL) {
        rw_exit(&nm_lock.lock);
        return (0);
    }

    for (cur_side = 0; cur_side < MD_MAXSIDES; cur_side++) {
        entry = lookup_entry(nh, setno, cur_side, key, NODEV64, 0L);
        if (entry == NULL)
            continue;

        if (entry->n_side != side) {
            /* Not the import side: drop the entry */
            (void) remove_entry(nh, entry->n_side, key, 0L);
            continue;
        }

        /* Import side: rewrite n_minor only for metadevices */
        drvname = (char *)getshared_name(setno, entry->n_drv_key, 0L);
        if (drvname != NULL && strcmp(drvname, "md") == 0) {
            entry->n_minor = MD_MKMIN(setno,
                MD_MIN2UNIT(entry->n_minor));
        }
    }

    rw_exit(&nm_lock.lock);
    return (1);
}

/*
 * md_update_top_device_minor
 *
 * Update the minor number stored in the namespace entry of a top
 * level metadevice.  Called from mod_imp_set where mod is sp, stripe,
 * mirror and raid.
 *
 * The entry is looked up by device ('dev') on the given side using a
 * wildcard key.  If found and its driver name is "md", n_minor is
 * rebuilt for 'setno'.  A missing entry is not an error.
 *
 * Returns 1 on success, 0 if the namespace could not be loaded, has no
 * first record, or the entry's driver name cannot be resolved.
 */
int
md_update_top_device_minor(
    set_t   setno,
    side_t  side,
    md_dev64_t dev
)
{
    struct nm_next_hdr  *nh;
    struct nm_name      *entry;
    char            *drvname;
    int         status = 0;

    /* The devid namespace is optional; the regular one is required */
    (void) md_load_namespace(setno, NULL, NM_DEVID);
    if (! md_load_namespace(setno, NULL, 0L)) {
        /* Back out the devid namespace load */
        (void) md_unload_namespace(setno, NM_DEVID);
        return (0);
    }

    rw_enter(&nm_lock.lock, RW_READER);

    nh = get_first_record(setno, 0, NM_NOTSHARED);
    if (nh != NULL) {
        status = 1;
        entry = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L);
        if (entry != NULL) {
            drvname = (char *)getshared_name(setno,
                entry->n_drv_key, 0L);
            if (drvname == NULL) {
                /* Entry exists but its driver is unknown */
                status = 0;
            } else if (strcmp(drvname, "md") == 0) {
                entry->n_minor = MD_MKMIN(setno,
                    MD_MIN2UNIT(entry->n_minor));
            }
        }
    }

    rw_exit(&nm_lock.lock);
    return (status);
}

static void
md_imp_nm(
    mddb_set_t  *s
)
{
    mddb_db_t       *dbp;
    mddb_de_ic_t        *dep;
    struct nm_rec_hdr   *hdr;
    struct nm_header    *hhdr;
    set_t           setno = s->s_setno;

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        for (dep = dbp->db_firstentry; dep != NULL;
            dep = dep->de_next) {
            switch (dep->de_type1) {

            case MDDB_NM_HDR:
            case MDDB_DID_NM_HDR:

                hhdr = (struct nm_header *)
                    dep->de_rb_userdata;

                hdr = &hhdr->h_names;
                if (hdr->r_next_recid > 0) {
                    hdr->r_next_recid = MAKERECID(setno,
                        DBID(hdr->r_next_recid));
                }

                hdr = &hhdr->h_shared;
                if (hdr->r_next_recid > 0) {
                    hdr->r_next_recid = MAKERECID(setno,
                        DBID(hdr->r_next_recid));
                }
                break;

            case MDDB_NM:
            case MDDB_DID_NM:
            case MDDB_SHR_NM:
            case MDDB_DID_SHR_NM:

                hdr = (struct nm_rec_hdr *)
                    dep->de_rb_userdata;

                if (hdr->r_next_recid > 0) {
                    hdr->r_next_recid = MAKERECID
                        (setno, DBID(hdr->r_next_recid));
                }
                break;

            default:
                break;
            }
        }
    }
}

/*
 * update_db_rec
 *
 * Commit every record of the set that is not flagged MDDB_F_OPT,
 * one record per mddb_commitrecs call.
 *
 * Returns 0 on success, MDDB_E_NORECORD as soon as a commit fails.
 */
static int
update_db_rec(
    mddb_set_t  *s
)
{
    mddb_db_t   *dbp;
    mddb_de_ic_t    *dep;
    mddb_recid_t    commit_list[2];

    for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
        dep = dbp->db_firstentry;
        while (dep != NULL) {
            if ((dep->de_flags & MDDB_F_OPT) == 0) {
                /* Zero-terminated list holding a single id */
                commit_list[0] = MAKERECID(s->s_setno,
                    dep->de_recid);
                commit_list[1] = 0;
                if (mddb_commitrecs(commit_list))
                    return (MDDB_E_NORECORD);
            }
            dep = dep->de_next;
        }
    }
    return (0);
}

/*
 * update_mb
 *
 * Call update_mb_devid for every replica info entry of the set that
 * is not flagged MDDB_F_EMASTER.  For a set in MD_SET_REPLICATED_IMPORT
 * state the entry's own devid is passed; otherwise NULL is passed.
 *
 * Returns 0 on success, -1 if a replicated set has an entry without a
 * devid, or the first non-zero value returned by update_mb_devid.
 */
static int
update_mb(
    mddb_set_t  *s
)
{
    mddb_ri_t   *rip = s->s_rip;
    int rc;

    while (rip != NULL) {
        /* Entries with MDDB_F_EMASTER: disk is powered off or gone */
        if ((rip->ri_flags & MDDB_F_EMASTER) == 0) {
            if (md_get_setstatus(s->s_setno) &
                MD_SET_REPLICATED_IMPORT) {
                /* Replicated set: a devid is mandatory */
                if (rip->ri_devid == (ddi_devid_t)NULL)
                    return (-1);
                rc = update_mb_devid(s, rip, rip->ri_devid);
            } else {
                /* Non-replicated set: no devid to update */
                rc = update_mb_devid(s, rip, NULL);
            }

            if (rc)
                return (rc);
        }
        rip = rip->ri_next;
    }

    return (0);
}

/*
 * update_setname
 *
 * Replace the shared namespace entry that matches "/dev/md/" with a
 * new entry named "/dev/md/<setname>/dsk/", where <setname> is taken
 * from the set's incore mddb_set_t.  The key, count and data of the
 * old entry are carried over to the new one, and the new record plus
 * the namespace record are committed together.
 *
 * Returns 0 on success, and also when the set has no shared namespace
 * record or no matching entry.  Returns MDDB_E_NORECORD if the old
 * entry cannot be removed or the new one cannot be allocated,
 * otherwise the result of mddb_commitrecs.
 */
static int
update_setname(
    set_t   setno
)
{
    struct nm_next_hdr  *nh;
    struct nm_shared_name   *old_ent, *new_ent;
    char            *prefix = "/dev/md/";
    char            *path;
    int         pathlen;
    mdkey_t         saved_key;
    uint32_t        saved_count, saved_data;
    mddb_recid_t        new_recid, commit_ids[3];
    int         err = 0;
    mddb_set_t      *setp;

    /* Build the new name; pathlen includes the terminating NUL */
    setp = (mddb_set_t *)md_set[setno].s_db;
    pathlen = strlen(prefix) + strlen(setp->s_setname) +
        strlen("/dsk/") + 1;
    path = kmem_zalloc(pathlen, KM_SLEEP);
    (void) sprintf(path, "%s%s%s", prefix, setp->s_setname, "/dsk/");

    rw_enter(&nm_lock.lock, RW_WRITER);

    /* No namespace is okay */
    nh = get_first_record(setno, 0, NM_SHARED);
    if (nh == NULL)
        goto out;

    /* No metadevice is okay */
    old_ent = (struct nm_shared_name *)lookup_shared_entry(nh,
        0, prefix, NULL, NM_SHARED | NM_IMP_SHARED);
    if (old_ent == NULL)
        goto out;

    /* Remember what must survive the remove/alloc cycle */
    saved_key = old_ent->sn_key;
    saved_count = old_ent->sn_count;
    saved_data = old_ent->sn_data;

    if (remove_shared_entry(nh, saved_key, NULL, 0L | NM_IMP_SHARED |
        NM_NOCOMMIT | NM_KEY_RECYCLE)) {
        err = MDDB_E_NORECORD;
        goto out;
    }

    new_ent = (struct nm_shared_name *)alloc_entry(nh,
        md_set[setno].s_nmid, pathlen, NM_SHARED | NM_NOCOMMIT,
        &new_recid);
    if (new_ent == NULL) {
        err = MDDB_E_NORECORD;
        goto out;
    }

    new_ent->sn_key = saved_key;
    new_ent->sn_count = saved_count;
    new_ent->sn_data = saved_data;
    new_ent->sn_namlen = (ushort_t)pathlen;
    (void) strcpy(new_ent->sn_name, path);

    /* Commit the new entry's record together with the nm record */
    commit_ids[0] = new_recid;
    commit_ids[1] = md_set[setno].s_nmid;
    commit_ids[2] = 0;
    err = mddb_commitrecs(commit_ids);

out:
    /* path came from a KM_SLEEP allocation, so it is never NULL */
    kmem_free(path, pathlen);
    rw_exit(&nm_lock.lock);
    return (err);
}

/*
 * Returns 0 on success.
 * Returns -1 on failure with ep filled in.
 */
static int
md_imp_db(
    set_t       setno,
    int     stale_flag,
    md_error_t  *ep
)
{
    mddb_set_t  *s;
    int     err = 0;
    mddb_dt_t   *dtp;
    mddb_lb_t   *lbp;
    int     i;
    int     loccnt;

    if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
        return (mddbstatus2error(ep, err, NODEV32, setno));
    }

    /* Update dt */
    if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
        crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
    }

    if ((err = dt_write(s)) != 0) {
        err = mdsyserror(ep, err);
        mddb_setexit(s);
        return (err);
    }

    /*
     * Update lb, no need to update the mediator because
     * the diskset will only exist on the importing node
     * and as such a mediator adds no value.
     */

    /* Update lb */
    if (stale_flag & MD_IMP_STALE_SET) {
        lbp = s->s_lbp;
        loccnt = lbp->lb_loccnt;
        for (i = 0; i < loccnt; i++) {
            mddb_locator_t  *lp = &lbp->lb_locators[i];
            md_dev64_t  ndev = md_expldev(lp->l_dev);
            ddi_devid_t devid_ptr;

            devid_ptr = s->s_did_icp->did_ic_devid[i];
            if (devid_ptr == NULL) {
                /*
                 * Already deleted, go to next one.
                 */
                continue;
            }
            if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
                NULL)) {
                /* disk unavailable, mark deleted */
                lp->l_flags = MDDB_F_DELETED;
                /* then remove the device id from the list */
                free_mbipp(&s->s_mbiarray[i]);
                (void) mddb_devid_delete(s, i);
            }
        }
        md_clr_setstatus(setno, MD_SET_STALE);
    }

    if ((err = writelocall(s)) != 0) {
        err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
        mddb_setexit(s);
        return (err);
    }

    mddb_setexit(s);

    /* Update db records */
    if ((err = update_db_rec(s)) != 0) {
        return (mddbstatus2error(ep, err, NODEV32, setno));
    }

    /* Update setname embedded in the namespace */
    if ((err = update_setname(setno)) != 0)
        return (mddbstatus2error(ep, err, NODEV32, setno));

    return (err);
}

/*
 * md_dr_add
 *
 * Append drive record 'dr' to the drive record chain of set record
 * 'sr'.  An empty chain (sr_driverec == 0) gets 'dr' as its head;
 * otherwise the chain is followed through dr_nextrec to its last
 * element, which is then linked to 'dr'.
 */
static void
md_dr_add(
    md_set_record   *sr,
    md_drive_record *dr
)
{
    md_drive_record *tail;

    if (sr->sr_driverec == 0) {
        sr->sr_driverec = dr->dr_selfid;
        return;
    }

    /* Find the record whose dr_nextrec is still unset */
    tail = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
    while (tail->dr_nextrec != 0)
        tail = (md_drive_record *)mddb_getrecaddr(tail->dr_nextrec);

    tail->dr_nextrec = dr->dr_selfid;
}

/*
 * md_setup_recids
 *
 * Allocate a zero-terminated array of record ids holding the set
 * record's own id followed by the id of every drive record on its
 * chain, and hand it back through *ids.
 *
 * 'size' is the number of array elements to allocate.  No bounds check
 * is done here, so the caller must pass at least the number of drive
 * records plus two.  The array is allocated with kmem_zalloc; this
 * function does not free it.
 */
static void
md_setup_recids(
    md_set_record   *sr,
    mddb_recid_t    **ids,
    size_t      size
)
{
    md_drive_record *cur;
    mddb_recid_t    *list;
    int     n = 0;

    list = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
        * size, KM_SLEEP);
    list[n++] = sr->sr_selfid;

    cur = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
    while (cur != NULL) {
        list[n++] = cur->dr_selfid;
        if (cur->dr_nextrec == 0)
            cur = NULL;
        else
            cur = (md_drive_record *)mddb_getrecaddr
                (cur->dr_nextrec);
    }

    /* Terminate the list */
    list[n] = 0;
    *ids = &list[0];
}

/*
 * md_update_namespace_rr_did
 *
 * Replace the old device id (cp->c_locator.l_old_devid) with the new
 * one (cp->c_locator.l_devid) in the devid namespace of set
 * cp->c_setno.  This is used for importing remotely replicated
 * drives.
 *
 * Every key of the namespace is visited.  For each key that has a
 * devid entry, the shared devid entry it points to is compared with
 * the old device id; on a match the shared entry is removed and a new
 * one, carrying the same key and count and with NM_DEVID_VALID set in
 * its data, is allocated for the new device id and committed.
 *
 * Return values:
 *  0 - success, or there is no devid namespace at all
 *  EIO - namespace of the set is not loaded
 *  ENOENT - shared devid record or shared devid entry is missing
 *  ENOMEM - a replacement shared entry could not be allocated
 */
int
md_update_namespace_rr_did(
    mddb_config_t   *cp
)
{
    set_t           setno = cp->c_setno;
    struct nm_next_hdr  *nh;
    struct nm_next_hdr  *shr_nh;
    struct did_min_name *min_ent;
    struct did_shr_name *old_ent;
    struct did_shr_name *new_ent;
    mdkey_t         key = MD_KEYWILD;
    side_t          side = MD_SIDEWILD;
    mddb_recid_t        recids[3];
    mdkey_t         saved_key;
    uint32_t        saved_count;
    uint32_t        saved_data;
    ddi_devid_t     devid = NULL;
    void            *old_devid, *new_devid;

    if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
        return (EIO);

    old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
    new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;

    /* It is okay if we don't have any configuration */
    nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED);
    if (nh == NULL)
        return (0);

    /* check out every entry in the namespace */
    while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
        min_ent = (struct did_min_name *)lookup_entry(nh, setno,
            side, key, NODEV64, NM_DEVID);
        if (min_ent == NULL)
            continue;

        shr_nh = get_first_record(setno, 0, NM_DEVID | NM_SHARED);
        if (shr_nh == NULL)
            return (ENOENT);

        old_ent = (struct did_shr_name *)lookup_shared_entry(
            shr_nh, min_ent->min_devid_key, (char *)0,
            &recids[0], NM_DEVID);
        if (old_ent == NULL)
            return (ENOENT);

        rw_enter(&nm_lock.lock, RW_WRITER);

        /* Only entries that carry the old devid are rewritten */
        devid = (ddi_devid_t)old_ent->did_devid;
        if (ddi_devid_compare(devid, old_devid) != 0) {
            rw_exit(&nm_lock.lock);
            continue;
        }

        /* first remove old devid info */
        saved_key = old_ent->did_key;
        saved_count = old_ent->did_count;
        saved_data = old_ent->did_data;
        (void) remove_shared_entry(shr_nh, old_ent->did_key, NULL,
            NM_DEVID | NM_IMP_SHARED | NM_KEY_RECYCLE);

        /* add in new devid info */
        new_ent = (struct did_shr_name *)alloc_entry(shr_nh,
            md_set[setno].s_did_nmid, cp->c_locator.l_devid_sz,
            NM_DEVID | NM_SHARED | NM_NOCOMMIT, &recids[0]);
        if (new_ent == NULL) {
            rw_exit(&nm_lock.lock);
            return (ENOMEM);
        }

        new_ent->did_key = saved_key;
        new_ent->did_count = saved_count;
        saved_data |= NM_DEVID_VALID;
        new_ent->did_data = saved_data;
        new_ent->did_size = ddi_devid_sizeof(new_devid);
        bcopy((void *)new_devid, (void *)new_ent->did_devid,
            new_ent->did_size);

        /* Commit the new entry's record and the nm record */
        recids[1] = md_set[setno].s_nmid;
        recids[2] = 0;
        mddb_commitrecs_wrapper(recids);

        rw_exit(&nm_lock.lock);
    }

    return (0);
}

/*
 * md_update_nm_rr_did_ioctl
 *
 * Wrapper for md_update_namespace_rr_did; the namespace is loaded
 * before this is called.
 *
 * md_update_namespace_rr_did may be called twice if attempting to
 * resolve a replicated device id during the take of a diskset - once
 * for the diskset namespace and a second time for the local namespace.
 * The local namespace would need to be updated when a drive has been
 * found during a take of the diskset that hadn't been resolved during
 * the import (aka partial replicated import).
 *
 * If being called during the import of the diskset (MDDB_C_IMPORT set
 * in cp->c_flags), md_update_namespace_rr_did is called only once, for
 * the diskset namespace.
 *
 * Note that cp->c_setno is overwritten with MD_LOCAL_SET when the
 * local namespace is updated.
 *
 * Returns 0 on success, otherwise the failing update's return value.
 */
int
md_update_nm_rr_did_ioctl(
    mddb_config_t   *cp
)
{
    int status;

    /* Diskset namespace first; a failure here ends the operation */
    status = md_update_namespace_rr_did(cp);
    if (status != 0)
        return (status);

    /* During an import only the diskset namespace is touched */
    if (cp->c_flags & MDDB_C_IMPORT)
        return (0);

    /* Now the local namespace; its result is the final result */
    cp->c_setno = MD_LOCAL_SET;
    return (md_update_namespace_rr_did(cp));
}

/*
 * md_imp_snarf_set
 *
 * Fix up the incore and ondisk state of set cp->c_setno while it is
 * being imported:
 *  - mark the set MD_SET_IMPORT,
 *  - reset the set number in the locator block and in the namespace
 *    records,
 *  - load the devid and regular namespaces,
 *  - let every module that provides md_imp_set fix up its unit
 *    records, repeating until no module reports further work,
 *  - write the result out through md_imp_db.
 *
 * Unless the set is in MD_SET_REPLICATED_IMPORT state, the set is then
 * halted and unloaded again and the import status bits are cleared.
 * For a replicated import the namespace stays loaded; it is unloaded
 * by an explicit RELEASE_SET ioctl.
 *
 * cp->c_flags supplies the stale flag passed to md_imp_db and
 * cp->c_mde receives error details.
 *
 * Returns 0 on success (also when the set is already marked
 * MD_SET_IMPORT), otherwise the value produced by the md error
 * routines.
 */
/*ARGSUSED*/
int
md_imp_snarf_set(
    mddb_config_t   *cp
)
{
    set_t       setno = cp->c_setno;
    int     stale_flag = cp->c_flags;
    md_error_t  *ep = &cp->c_mde;
    mddb_set_t  *s;
    md_ops_t    *ops;
    int     more_work;
    int     err = 0;

    mdclrerror(ep);
    if (setno >= md_nsets) {
        return (mdsyserror(ep, EINVAL));
    }

    md_haltsnarf_enter(setno);
    if (md_get_setstatus(setno) & MD_SET_IMPORT) {
        goto out;
    }

    /* Set the bit first otherwise load_old_replicas can fail */
    md_set_setstatus(setno, MD_SET_IMPORT);

    s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
    if (s == NULL) {
        err = mddbstatus2error(ep, err, NODEV32, setno);
        goto out;
    }

    /*
     * Upon completion of load_old_replicas, the old setno is
     * restored from the disk so we need to reset
     */
    s->s_lbp->lb_setno = setno;

    /* Fixup the NM records before loading namespace */
    (void) md_imp_nm(s);
    mddb_setexit(s);

    /* Both the devid and the regular namespace must load */
    if (!md_load_namespace(setno, NULL, NM_DEVID)) {
        err = mdsyserror(ep, ENOENT);
        goto cleanup;
    }
    if (!md_load_namespace(setno, NULL, 0L)) {
        (void) md_unload_namespace(setno, NM_DEVID);
        err = mdsyserror(ep, ENOENT);
        goto cleanup;
    }

    /*
     * Ask each module to fixup its unit records; keep making passes
     * over the module list until a pass reports nothing.
     */
    for (;;) {
        more_work = 0;
        for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
            if (ops->md_imp_set != NULL)
                more_work += ops->md_imp_set(setno);
        }
        if (!more_work)
            break;
    }

    /*
     * Fixup
     *  (1) locator block
     *  (2) locator name block if necessary
     *  (3) master block
     *  (4) directory block
     * calls appropriate writes to push changes out
     */
    err = md_imp_db(setno, stale_flag, ep);
    if (err != 0) {
        goto cleanup;
    }

    /*
     * Don't unload namespace if importing a replicated diskset.
     * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
     */
    if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
        md_haltsnarf_exit(setno);
        return (err);
    }

cleanup:
    /* Halt the set */
    rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    (void) md_halt_set(setno, MD_HALT_ALL);
    rw_exit(&md_unit_array_rw.lock);

    /* Unload the namespace for the imported set */
    mutex_enter(&mddb_lock);
    mddb_unload_set(setno);
    mutex_exit(&mddb_lock);

out:
    md_haltsnarf_exit(setno);
    md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
    return (err);
}
#endif  /* MDDB_FAKE */