libmeta/common/meta_set_drv.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * Metadevice diskset interfaces
 */

#include <meta.h>
#include <mdmn_changelog.h>
#include "meta_set_prv.h"
#include "meta_repartition.h"

static int
check_setnodes_againstdrivelist(
    mdsetname_t     *sp,
    mddrivenamelist_t   *dnlp,
    md_error_t      *ep
)
{
    md_set_desc     *sd;
    mddrivenamelist_t   *p;
    int             i;
    md_mnnode_desc      *nd;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }
            for (p = dnlp; p != NULL; p = p->next)
                if (checkdrive_onnode(sp, p->drivenamep,
                    nd->nd_nodename, ep))
                    return (-1);
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            for (p = dnlp; p != NULL; p = p->next)
                if (checkdrive_onnode(sp, p->drivenamep,
                    sd->sd_nodes[i], ep))
                    return (-1);
        }
    }
    return (0);
}

static int
drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
{
    mddrivenamelist_t *dl1, *dl2;
    mddrivename_t *dn1, *dn2;

    for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
        dn1 = dl1->drivenamep;

        for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
            dn2 = dl2->drivenamep;
            if (strcmp(dn1->cname, dn2->cname) != 0)
                continue;

            return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
                NULL, dn1->cname, sp->setname));
        }
    }
    return (0);
}

static md_drive_desc *
metaget_drivedesc_fromdrivelist(
    mdsetname_t     *sp,
    mddrivenamelist_t   *dnlp,
    uint_t          flags,
    md_error_t      *ep
)
{
    mddrivenamelist_t   *p;
    md_drive_desc       *dd = NULL;
    md_set_desc     *sd;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (NULL);

    for (p = dnlp; p != NULL; p = p->next) {
        (void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
            sd->sd_ctime, sd->sd_genid, flags);
    }

    return (dd);
}

/*
 * Exported Entry Points
 */

int
meta_make_sidenmlist(
    mdsetname_t     *sp,
    mddrivename_t       *dnp,
    int         import_flag, /* flags partial import */
    md_im_drive_info_t  *midp,  /* import drive information */
    md_error_t      *ep
)
{
    mdsidenames_t       *sn, **sn_next;
    mdname_t        *np;
    int         done;
    side_t          sideno = MD_SIDEWILD;
    uint_t          rep_slice;
    char            *bname;

    if (!import_flag) {
        /*
         * Normal (aka NOT partial import) code path.
         */
        if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
            return (-1);
        }

        dnp->side_names_key = MD_KEYWILD;

        if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
            return (-1);
        bname = Strdup(np->bname);
    } else {
        /*
         * When doing a partial import, we'll get the needed
         * information from somewhere other than the system.
         */
        dnp->side_names_key = MD_KEYWILD;
        bname = Strdup(midp->mid_devname);
    }
    metaflushsidenames(dnp);
    sn_next = &dnp->side_names;
    /*CONSTCOND*/
    while (1) {
        sn = Zalloc(sizeof (*sn));

        if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
            &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
            if (import_flag) {
                mdclrerror(ep);
                sn->dname = Strdup(midp->mid_driver_name);
                sn->mnum = midp->mid_mnum;
            } else {
                Free(sn);
                Free(bname);
                return (-1);
            }
        }

        if (done == 0) {
            Free(sn);
            Free(bname);
            return (0);
        }

        sn->sideno = sideno;

        /* Add to the end of the linked list */
        assert(*sn_next == NULL);
        *sn_next = sn;
        sn_next = &sn->next;
    }
    /*NOTREACHED*/
}

int
meta_set_adddrives(
    mdsetname_t     *sp,
    mddrivenamelist_t   *dnlp,
    daddr_t         dbsize,
    int         force_label,
    md_error_t      *ep
)
{
    md_set_desc     *sd;
    md_drive_desc       *dd = NULL, *curdd = NULL, *ddp;
    int         i;
    mddrivenamelist_t   *p;
    mhd_mhiargs_t       mhiargs;
    int         rval = 0;
    md_timeval32_t      now;
    sigset_t        oldsigs;
    ulong_t         genid;
    ulong_t         max_genid = 0;
    md_setkey_t     *cl_sk;
    int         rb_level = 0;
    md_error_t      xep = mdnullerror;
    md_mnnode_desc      *nd;
    int         suspendall_flag = 0;
    int         suspend1_flag = 0;
    int         lock_flag = 0;
    int         flush_set_onerr = 0;
    md_replicalist_t    *rlp = NULL, *rl;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    /* Make sure we own the set */
    if (meta_check_ownership(sp, ep) != 0)
        return (-1);

    /*
     * The drive and node records are stored in the local mddbs of each
     * node in the diskset.  Each node's rpc.metad daemon reads in the set,
     * drive and node records from that node's local mddb and caches them
     * internally. Any process needing diskset information contacts its
     * local rpc.metad to get this information.  Since each node in the
     * diskset is independently reading the set information from its local
     * mddb, the set, drive and node records in the local mddbs must stay
     * in-sync, so that all nodes have a consistent view of the diskset.
     *
     * For a multinode diskset, explicitly verify that all nodes in the
     * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
     * fail this operation since all nodes must be ALIVE in order to add
     * the new drive record to their local mddb.  If a panic of this node
     * leaves the local mddbs set, node and drive records out-of-sync, the
     * reconfig cycle will fix the local mddbs and force them back into
     * synchronization.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
                    sp->setno,
                    nd->nd_nodename, NULL, sp->setname);
                return (-1);
            }
            nd = nd->nd_next;
        }
    }

    if (drvsuniq(sp, dnlp, ep) == -1)
        return (-1);

    /*
     * Lock the set on current set members.
     * Set locking done much earlier for MN diskset than for traditional
     * diskset since lock_set and SUSPEND are used to protect against
     * other meta* commands running on the other nodes.
     */
    if (MD_MNSET_DESC(sd)) {
        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);

        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
            nd = nd->nd_next;
        }
        /*
         * Lock out other meta* commands by suspending
         * class 1 messages across the diskset.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename,
                COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
                MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspend1_flag = 1;
            nd = nd->nd_next;
        }
    }

    if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
        rval = -1;
        goto out;
    }

    for (p = dnlp; p != NULL; p = p->next) {
        mdsetname_t *tmp;

        if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
            ep) == -1) {
            rval = -1;
            goto out;
        }

        if (tmp != NULL) {
            (void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
                tmp->setname, p->drivenamep->cname, sp->setname);
            rval = -1;
            goto out;
        }
    }

    /* END CHECK CODE */

    /*
     * This is a separate loop (from above) so that we validate all the
     * drives handed to us before we repartition any one drive.
     */
    for (p = dnlp; p != NULL; p = p->next) {
        if (meta_repartition_drive(sp,
            p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
            NULL, /* Don't return the VTOC. */
            ep) != 0) {
            rval = -1;
            goto out;
        }
        /*
         * Create the names for the drives we are adding per side.
         */
        if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
            ep) == -1) {
            rval = -1;
            goto out;
        }
    }

    /*
     * Get the list of drives descriptors that we are adding.
     */
    dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);

    if (! mdisok(ep)) {
        rval = -1;
        goto out;
    }

    /*
     * Get the set timeout information.
     */
    (void) memset(&mhiargs, '\0', sizeof (mhiargs));
    if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
        rval = -1;
        goto out;
    }

    /*
     * Get timestamp and generation id for new records
     */
    now = sd->sd_ctime;
    genid = sd->sd_genid;


    /* At this point, in case of error, set should be flushed. */
    flush_set_onerr = 1;

    /* Lock the set on current set members */
    if (!(MD_MNSET_DESC(sd))) {
        md_rb_sig_handling_on();
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
        }
    }

    /*
     * Get drive descriptors for the drives that are currently in the set.
     */
    curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
    if (! mdisok(ep))
        goto rollback;

    /*
     * If first drive being added to set, set the mastership
     * of the multinode diskset to be this node.
     * Only set it on this node.  If all goes well
     * and there are no errors, the mastership of this node will be set
     * on all nodes in user space and in the kernel.
     */
    if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
        if (clnt_mnsetmaster(mynode(), sp,
            sd->sd_mn_mynode->nd_nodename,
            sd->sd_mn_mynode->nd_nodeid, ep)) {
            goto rollback;
        }
        /*
         * Set this up in my local cache of the set desc so that
         * the set descriptor won't have to be gotten again from
         * rpc.metad.  If it is flushed and gotten again, these
         * values will be set in sr2setdesc.
         */
        sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
        (void) strcpy(sd->sd_mn_master_nodenm,
            sd->sd_mn_mynode->nd_nodename);
        sd->sd_mn_am_i_master = 1;
    }

    RB_TEST(1, "adddrives", ep)

    RB_PREEMPT;
    rb_level = 1;   /* level 1 */

    RB_TEST(2, "adddrives", ep)

    /*
     * Add the drive records for the drives that we are adding to
     * each host in the set.  Marks the drive as MD_DR_ADD.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
                ep) == -1)
                goto rollback;

            RB_TEST(3, "adddrives", ep)
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
                ep) == -1)
                goto rollback;

            RB_TEST(3, "adddrives", ep)
        }
    }

    RB_TEST(4, "adddrives", ep)

    RB_PREEMPT;
    rb_level = 2;   /* level 2 */

    RB_TEST(5, "adddrives", ep)

    /*
     * Take ownership of the added drives.
     */
    if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
        if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
            goto rollback;
    }

    /*
     * If this is not a MN set and the state flags do not indicate the
     * presence of devids, update the set records on all nodes.
     */
    if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
        if (meta_update_mb(sp, dd, ep) == 0) {
            mdclrerror(ep);

            /* update the sr_flags on all hosts */
            for (i = 0; i < MD_MAXSIDES; i++) {
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_upd_sr_flags(sd->sd_nodes[i],
                    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
                    goto rollback;
            }
        }
    }

    RB_TEST(6, "adddrives", ep)

    RB_PREEMPT;
    rb_level = 3;   /* level 3 */

    RB_TEST(7, "adddrives", ep)

    /*
     * Balance the DB's according to the list of existing drives and the
     * list of added drives.
     */
    if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
        goto rollback;

    /*
     * Slam a dummy master block on all the disks that we are adding
     * that don't have replicas on them.
     * Used by diskset import if the disksets are remotely replicated
     */
    if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
        for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
            uint_t      rep_slice;
            int     fd = -1;
            mdname_t    *np = NULL;
            char        *drive_name;

            drive_name = ddp->dd_dnp->cname;

            for (rl = rlp; rl != NULL; rl = rl->rl_next) {
                char    *rep_name;

                rep_name =
                    rl->rl_repp->r_namep->drivenamep->cname;

                if (strcmp(drive_name, rep_name) == 0) {
                    /*
                     * Disk has a replica on it so don't
                     * add dummy master block.
                     */
                    break;
                }
            }
            if (rl == NULL) {
                /*
                 * Drive doesn't have a replica on it so
                 * we need a dummy master block. Add it.
                 */
                if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
                    &xep) != 0) {
                    mdclrerror(&xep);
                    continue;
                }

                if ((np = metaslicename(ddp->dd_dnp, rep_slice,
                    &xep)) == NULL) {
                    mdclrerror(&xep);
                    continue;
                }

                if ((fd = open(np->rname, O_RDWR)) >= 0) {
                    meta_mkdummymaster(sp, fd, 16);
                    (void) close(fd);
                }
            }
        }
    }

    if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
        /*
         * Notify rpc.mdcommd on all nodes of a nodelist change.
         * Start by suspending rpc.mdcommd (which drains it of all
         * messages), then change the nodelist followed by a reinit
         * and resume.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspendall_flag = 1;
            nd = nd->nd_next;
        }
    }

    /*
     * If a MN diskset and this is the first disk(s) being added
     * to set, then pre-allocate change log records here.
     * When the other nodes are joined into the MN diskset, the
     * USER records will just be snarfed in.
     */
    if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
        if (mdmn_allocate_changelog(sp, ep) != 0)
            goto rollback;
    }

    /*
     * Mark the drives MD_DR_OK.
     * If first drive being added to MN diskset, then set
     * master on all nodes to be this node and then join
     * all alive nodes (nodes in membership list) to set.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /* don't set master on this node - done earlier */
            if ((curdd == NULL) && (nd->nd_nodeid !=
                sd->sd_mn_mynode->nd_nodeid)) {
                /*
                 * Set master on all alive nodes since
                 * all alive nodes will become joined nodes.
                 */
                if (clnt_mnsetmaster(nd->nd_nodename, sp,
                    sd->sd_mn_mynode->nd_nodename,
                    sd->sd_mn_mynode->nd_nodeid, ep)) {
                    goto rollback;
                }
            }

            if (curdd == NULL) {
                /*
                 * No special flags for join set.  Since
                 * all nodes are joining if 1st drive is being
                 * added to set then all nodes will be either
                 * STALE or non-STALE and each node can
                 * determine this on its own.
                 */
                if (clnt_joinset(nd->nd_nodename, sp,
                    NULL, ep)) {
                    goto rollback;
                }
                /* Sets join node flag on all nodes in list */
                if (clnt_upd_nr_flags(nd->nd_nodename, sp,
                    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
                    goto rollback;
                }
            }

            /*
             * Set MD_DR_OK as last thing before unlock.
             * In case of panic on this node, recovery
             * code can check for MD_DR_OK to determine
             * status of diskset.
             */
            if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
                MD_DR_OK, ep) == -1)
                goto rollback;


            RB_TEST(8, "adddrives", ep)
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
                ep) == -1)
                goto rollback;

            RB_TEST(8, "adddrives", ep)
        }
    }

    RB_TEST(9, "adddrives", ep)

out:
    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Send reinit command to mdcommd which forces it to get
     * fresh set description.
     */
    if (suspendall_flag) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
    }
    /*
     * Unlock diskset by resuming messages across the diskset.
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if ((suspend1_flag) || (suspendall_flag)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    if (lock_flag) {
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                if (clnt_unlock_set(nd->nd_nodename,
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_unlock_set(sd->sd_nodes[i],
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
            }
        }
        cl_set_setkey(NULL);
    }

    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
        if (!(MD_MNSET_DESC(sd))) {
            md_rb_sig_handling_off(md_got_sig(), md_which_sig());
        }
    }

    if (MD_MNSET_DESC(sd)) {
        /* release signals back to what they were on entry */
        if (procsigs(FALSE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    return (rval);

rollback:
    /* all signals already blocked for MN disket */
    if (!(MD_MNSET_DESC(sd))) {
        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    rval = -1;

    max_genid = sd->sd_genid;

    /* level 3 */
    if (rb_level > 2) {
        /*
         * Since the add drive operation is failing, need
         * to reset config back to the way it was
         * before the add drive opration.
         * If a MN diskset and this is the first drive being added,
         * then reset master on all ALIVE nodes (which is all nodes)
         * since the master would have not been set previously.
         * Don't reset master on this node, since this
         * is done later.
         * This is ok to fail since next node to add first
         * disk to diskset will also set the master on all nodes.
         *
         * Also, if this is the first drive being added,
         * need to have each node withdraw itself from the set.
         */
        if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                /*
                 * Be careful with ordering in case of
                 * panic between the steps and the
                 * effect on recovery during reconfig.
                 */
                if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
                    mdclrerror(&xep);

                /* Sets withdraw flag on all nodes in list */
                if (clnt_upd_nr_flags(nd->nd_nodename, sp,
                    sd->sd_nodelist, MD_NR_WITHDRAW,
                    NULL, &xep)) {
                    mdclrerror(&xep);
                }

                /* Skip this node */
                if (nd->nd_nodeid ==
                    sd->sd_mn_mynode->nd_nodeid) {
                    nd = nd->nd_next;
                    continue;
                }
                /* Reset master on all of the other nodes. */
                if (clnt_mnsetmaster(nd->nd_nodename, sp,
                    "", MD_MN_INVALID_NID, &xep))
                    mdclrerror(&xep);
                nd = nd->nd_next;
            }
        }
    }

    /*
     * Send resume command to mdcommd.  Don't send reinit command
     * since nodelist should not have changed.
     * If suspendall_flag is set, then user would have been adding
     * first drives to set.  Since this failed, there is certainly
     * no reinit message to send to rpc.commd since no nodes will
     * be joined to set at the end of this metaset command.
     */
    if (suspendall_flag) {
        /* Send resume */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /*
             * Resume all classes but class 1 so that lock is held
             * against meta* commands.
             * To later resume class1, must issue a class0 resume.
             */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0,
                MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
                mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    /* level 3 */
    if (rb_level > 2) {
        mdnamelist_t    *nlp;
        mdname_t    *np;

        for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
            uint_t  rep_slice;

            if ((meta_replicaslice(ddp->dd_dnp,
                &rep_slice, &xep) != 0) ||
                ((np = metaslicename(ddp->dd_dnp, rep_slice,
                &xep)) == NULL)) {
                mdclrerror(&xep);
                continue;
            }
            nlp = NULL;
            (void) metanamelist_append(&nlp, np);

            if (meta_db_detach(sp, nlp,
                (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
                mdclrerror(&xep);

            metafreenamelist(nlp);
        }

        /* Re-balance */
        if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
            mdclrerror(&xep);

        /* Only if we are adding the first drive */
        /* Handled MN diskset above. */
        if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
            if (clnt_stimeout(mynode(), sp, &defmhiargs,
                &xep) == -1)
                mdclrerror(&xep);

            /* This is needed because of a corner case */
            if (halt_set(sp, &xep))
                mdclrerror(&xep);
        }
        max_genid++;
    }

    /* level 2 */
    if (rb_level > 1) {
        if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
            if (rel_own_bydd(sp, dd, TRUE, &xep))
                mdclrerror(&xep);
        }
    }

    /* level 1 */
    if (rb_level > 0) {
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                if (clnt_deldrvs(nd->nd_nodename, sp, dd,
                    &xep) == -1)
                    mdclrerror(&xep);
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
                    &xep) == -1)
                    mdclrerror(&xep);
            }
        }
        max_genid += 2;
        resync_genid(sp, sd, max_genid, 0, NULL);
    }

    if ((suspend1_flag) || (suspendall_flag)) {
        /* Send resume */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /*
             * Just resume all classes so that resume is the
             * same whether just one class was locked or all
             * classes were locked.
             */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    /* level 0 */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    /* Don't test lock flag since guaranteed to be set if in rollback */
    if (MD_MNSET_DESC(sd)) {
        /*
         * Since the add drive operation is failing, need
         * to reset config back to the way it was
         * before the add drive opration.
         * If a MN diskset and this is the first drive being
         * added, then reset master on this node since
         * the master would have not been set previously.
         * This is ok to fail since next node to add first
         * disk to diskset will also set the master on all nodes.
         */
        if (curdd == NULL) {
            /* Reset master on mynode */
            if (clnt_mnsetmaster(mynode(), sp, "",
                MD_MN_INVALID_NID, &xep))
                mdclrerror(&xep);
        }
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
                mdclrerror(&xep);
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
                mdclrerror(&xep);
        }
    }
    cl_set_setkey(NULL);

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
        if (!(MD_MNSET_DESC(sd))) {
            md_rb_sig_handling_off(md_got_sig(), md_which_sig());
        }
    }

    return (rval);
}

/*
 * Add drives routine used during import of a diskset.
 */
int
meta_imp_set_adddrives(
    mdsetname_t     *sp,
    mddrivenamelist_t   *dnlp,
    md_im_set_desc_t    *misp,
    md_error_t      *ep
)
{
    md_set_desc     *sd;
    mddrivenamelist_t   *p;
    md_drive_desc       *dd = NULL, *ddp;
    int         flush_set_onerr = 0;
    md_timeval32_t      now;
    ulong_t         genid;
    mhd_mhiargs_t       mhiargs;
    md_im_replica_info_t    *mirp;
    md_im_drive_info_t  *midp;
    int         rval = 0;
    sigset_t        oldsigs;
    ulong_t         max_genid = 0;
    int         rb_level = 0;
    md_error_t      xep = mdnullerror;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    for (p = dnlp; p != NULL; p = p->next) {
        int     imp_flag = 0;

        /*
         * If we have a partial diskset, meta_make_sidenmlist will
         * need information from midp to complete making the
         * side name structure.
         */
        if (misp->mis_partial) {
            imp_flag = MDDB_C_IMPORT;
            for (midp = misp->mis_drives; midp != NULL;
                midp = midp->mid_next) {
                if (midp->mid_dnp == p->drivenamep)
                    break;
            }
            if (midp == NULL) {
                (void) mddserror(ep, MDE_DS_SETNOTIMP,
                    MD_SET_BAD, mynode(), NULL, sp->setname);
                rval = -1;
                goto out;
            }
        }
        /*
         * Create the names for the drives we are adding per side.
         */
        if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
            midp, ep) == -1) {
            rval = -1;
            goto out;
        }
    }

    /*
     * Get the list of drives descriptors that we are adding.
     */
    dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);

    if (! mdisok(ep)) {
        rval = -1;
        goto out;
    }

    /*
     * Get the set timeout information.
     */
    (void) memset(&mhiargs, '\0', sizeof (mhiargs));
    if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
        rval = -1;
        goto out;
    }

    /*
     * Get timestamp and generation id for new records
     */
    now = sd->sd_ctime;
    genid = sd->sd_genid;

    /* At this point, in case of error, set should be flushed. */
    flush_set_onerr = 1;

    rb_level = 1;   /* level 1 */

    for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
        for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
            if (ddp->dd_dnp == midp->mid_dnp) {
                /* same disk */
                ddp->dd_dnp->devid =
                    devid_str_encode(midp->mid_devid,
                    midp->mid_minor_name);

                ddp->dd_dbcnt = 0;
                mirp = midp->mid_replicas;
                if (mirp) {
                    ddp->dd_dbsize = mirp->mir_length;
                    for (; mirp != NULL;
                        mirp = mirp->mir_next) {
                        ddp->dd_dbcnt++;
                    }
                }
                if ((midp->mid_available &
                    MD_IM_DISK_NOT_AVAILABLE) &&
                    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
                    ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
                }
            }
        }
    }

    /*
     * Add the drive records for the drives that we are adding to
     * each host in the set.  Marks the drive records as MD_DR_ADD.
     * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
     * this flag was set in the dd_flags for that drive.
     */
    if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
        goto rollback;

    rb_level = 2;   /* level 2 */

    /*
     * Take ownership of the added drives.
     */
    if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
        goto rollback;

out:
    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
    }

    return (rval);

rollback:
    /* Make sure we are blocking all signals */
    if (procsigs(TRUE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    rval = -1;

    max_genid = sd->sd_genid;

    /* level 2 */
    if (rb_level > 1) {
        if (!MD_ATSET_DESC(sd)) {
            if (rel_own_bydd(sp, dd, TRUE, &xep)) {
                mdclrerror(&xep);
            }
        }
    }

    /* level 1 */
    if (rb_level > 0) {
        if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
            mdclrerror(&xep);
        }
        max_genid += 2;
        resync_genid(sp, sd, max_genid, 0, NULL);
    }

    /* level 0 */

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
        md_rb_sig_handling_off(md_got_sig(), md_which_sig());
    }

    return (rval);
}

int
meta_set_deletedrives(
    mdsetname_t     *sp,
    mddrivenamelist_t   *dnlp,
    int         forceflg,
    md_error_t      *ep
)
{
    md_set_desc     *sd;
    md_drive_desc       *ddp, *dd = NULL, *curdd = NULL;
    md_replicalist_t    *rlp = NULL, *rl;
    mddrivenamelist_t   *p;
    int         deldrvcnt = 0;
    int         rval = 0;
    mhd_mhiargs_t       mhiargs;
    int         i;
    sigset_t        oldsigs;
    md_setkey_t     *cl_sk;
    ulong_t         max_genid = 0;
    int         rb_level = 0;
    md_error_t      xep = mdnullerror;
    md_mnnode_desc      *nd;
    int         has_set;
    int         current_drv_cnt = 0;
    int         suspendall_flag = 0, suspendall_flag_rb = 0;
    int         suspend1_flag = 0;
    int         lock_flag = 0;
    bool_t          stale_bool = FALSE;
    int         flush_set_onerr = 0;
    mdnamelist_t        *nlp;
    mdname_t        *np;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    /* Make sure we own the set */
    if (meta_check_ownership(sp, ep) != 0)
        return (-1);

    if (drvsuniq(sp, dnlp, ep) == -1)
        return (-1);

    /*
     * Check and see if all the nodes have the set.
     *
     * The drive and node records are stored in the local mddbs of each
     * node in the diskset.  Each node's rpc.metad daemon reads in the set,
     * drive and node records from that node's local mddb and caches them
     * internally. Any process needing diskset information contacts its
     * local rpc.metad to get this information.  Since each node in the
     * diskset is independently reading the set information from its local
     * mddb, the set, drive and node records in the local mddbs must stay
     * in-sync, so that all nodes have a consistent view of the diskset.
     *
     * For a multinode diskset, explicitly verify that all nodes in the
     * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
     * fail this operation since all nodes must be ALIVE in order to delete
     * a drive record from their local mddb.  If a panic of this node
     * leaves the local mddbs set, node and drive records out-of-sync, the
     * reconfig cycle will fix the local mddbs and force them back into
     * synchronization.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
                    sp->setno,
                    nd->nd_nodename, NULL, sp->setname);
                return (-1);
            }
            nd = nd->nd_next;
        }

        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);

        /*
         * Lock the set on current set members.
         * Set locking done much earlier for MN diskset than for
         * traditional diskset since lock_set and SUSPEND are used
         * to protect against other meta* commands running on the
         * other nodes.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
            nd = nd->nd_next;
        }
        /*
         * Lock out other meta* commands by suspending
         * class 1 messages across the diskset.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename,
                COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
                MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspend1_flag = 1;
            nd = nd->nd_next;
        }

        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (strcmp(nd->nd_nodename, mynode()) == 0) {
                nd = nd->nd_next;
                continue;
            }

            has_set = nodehasset(sp, nd->nd_nodename,
                    NHS_NSTG_EQ, ep);
            if (has_set < 0) {
                rval = -1;
                goto out;
            }

            if (! has_set) {
                (void) mddserror(ep, MDE_DS_NODENOSET,
                    sp->setno, nd->nd_nodename,
                    NULL, sp->setname);
                rval = -1;
                goto out;
            }
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (strcmp(sd->sd_nodes[i], mynode()) == 0)
                continue;

            has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
                ep);
            if (has_set < 0) {
                /*
                 * Can directly return since !MN diskset;
                 * nothing to unlock.
                 */
                return (-1);
            }

            if (! has_set) {
                /*
                 * Can directly return since !MN diskset;
                 * nothing to unlock.
                 */
                return (mddserror(ep, MDE_DS_NODENOSET,
                    sp->setno, sd->sd_nodes[i], NULL,
                    sp->setname));
            }
        }
    }

    for (p = dnlp; p != NULL; p = p->next) {
        int     is_it;
        mddrivename_t   *dnp;

        dnp = p->drivenamep;

        if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
            == -1) {
            rval = -1;
            goto out;
        }

        if (! is_it) {
            (void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
                NULL, dnp->cname, sp->setname);
            rval = -1;
            goto out;
        }

        if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
            rval = -1;
            goto out;
        }

        deldrvcnt++;
    }
    current_drv_cnt = deldrvcnt;

    /*
     * Get drive descriptors for the drives that are currently in the set.
     */
    curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
    if (! mdisok(ep)) {
        rval = -1;
        goto out;
    }

    /*
     * Decrement the the delete drive count for each drive currently in the
     * set.
     */
    for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
        deldrvcnt--;

    /*
     * If the count of drives we are deleting is equal to the drives in the
     * set, and we haven't specified forceflg, return an error
     */
    if (deldrvcnt == 0 && forceflg == FALSE) {
        (void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
        rval = -1;
        goto out;
    }

    /*
     * Get the list of drive descriptors that we are deleting.
     */
    dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
    if (! mdisok(ep)) {
        rval = -1;
        goto out;
    }

    /*
     * Get the set timeout information in case we have to roll back.
     */
    (void) memset(&mhiargs, '\0', sizeof (mhiargs));
    if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
        rval = -1;
        goto out;
    }

    /* At this point, in case of error, set should be flushed. */
    flush_set_onerr = 1;

    /* END CHECK CODE */

    /* Lock the set on current set members */
    if (!(MD_MNSET_DESC(sd))) {
        md_rb_sig_handling_on();
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
        }
    }

    if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
        mddb_config_t       c;
        /*
         * Is current set STALE?
         */
        (void) memset(&c, 0, sizeof (c));
        c.c_id = 0;
        c.c_setno = sp->setno;
        if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
            (void) mdstealerror(ep, &c.c_mde);
            rval = -1;
            goto out;
        }
        if (c.c_flags & MDDB_C_STALE) {
            stale_bool = TRUE;
        }
    }

    RB_TEST(1, "deletedrives", ep)

    RB_PREEMPT;
    rb_level = 1;   /* level 1 */

    RB_TEST(2, "deletedrives", ep)

    /*
     * Mark the drives MD_DR_DEL
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
                MD_DR_DEL, ep) == -1)
                goto rollback;

            RB_TEST(3, "deletedrives", ep)
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
                MD_DR_DEL, ep) == -1)
                goto rollback;

            RB_TEST(3, "deletedrives", ep)
        }
    }

    RB_TEST(4, "deletedrives", ep)

    RB_PREEMPT;
    rb_level = 2;   /* level 2 */

    RB_TEST(5, "deletedrives", ep)

    /*
     * Balance the DB's according to the list of existing drives and the
     * list of deleted drives.
     */
    if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
        goto rollback;

    /*
     * If the drive(s) to be deleted cannot be accessed,
     * they haven't really been deleted yet. Check and delete now
     * if need be.
     */
    if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
        nlp = NULL;
        for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
            char    *delete_name;

            delete_name = ddp->dd_dnp->cname;

            for (rl = rlp; rl != NULL; rl = rl->rl_next) {
                char    *cur_name;

                cur_name =
                    rl->rl_repp->r_namep->drivenamep->cname;

                if (strcmp(delete_name, cur_name) == 0) {
                    /* put it on the delete list */
                    np = rl->rl_repp->r_namep;
                    (void) metanamelist_append(&nlp, np);

                }
            }
        }

        if (nlp != NULL) {
            if (meta_db_detach(sp, nlp,
                (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
                ep) == -1) {
                metafreenamelist(nlp);
                goto rollback;
            }
            metafreenamelist(nlp);
        }
    }

    RB_TEST(6, "deletedrives", ep)

    RB_PREEMPT;
    rb_level = 3;   /* level 3 */

    RB_TEST(7, "deletedrives", ep)

    /*
     * Cannot suspend set until after meta_db_balance since
     * meta_db_balance uses META_DB_ATTACH/DETACH messages.
     */
    if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
        /*
         * Notify rpc.mdcommd on all nodes of a nodelist change.
         * Start by suspending rpc.mdcommd (which drains it of all
         * messages), then change the nodelist followed by a reinit
         * and resume.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspendall_flag = 1;
            nd = nd->nd_next;
        }
    }

    /*
     * Remove the drive records for the drives that were deleted from
     * each host in the set.  This removes the record and dr_flags.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
                goto rollback;

            RB_TEST(8, "deletedrives", ep)
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
                goto rollback;

            RB_TEST(8, "deletedrives", ep)
        }
    }

    RB_TEST(9, "deletedrives", ep)

    RB_PREEMPT;
    rb_level = 4;   /* level 4 */

    RB_TEST(10, "deletedrives", ep)

    if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
        if (rel_own_bydd(sp, dd, TRUE, ep))
            goto rollback;
    }

    /* If we deleted all the drives, then we need to halt the set. */
    if (deldrvcnt == 0) {
        RB_TEST(11, "deletedrives", ep)

        RB_PREEMPT;
        rb_level = 5;   /* level 5 */

        RB_TEST(12, "deletedrives", ep)

        if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
            goto rollback;

        RB_TEST(13, "deletedrives", ep)

        RB_PREEMPT;
        rb_level = 6;   /* level 6 */

        RB_TEST(14, "deletedrives", ep)

        /* Halt MN diskset on all nodes by having node withdraw */
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                /* Only withdraw nodes that are joined */
                if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                    nd = nd->nd_next;
                    continue;
                }
                /*
                 * Going to set locally cached node flags to
                 * rollback join so in case of error, the
                 * rollback code knows which nodes to re-join.
                 */
                nd->nd_flags |= MD_MN_NODE_RB_JOIN;

                /*
                 * Be careful in ordering of following steps
                 * so that recovery from a panic between
                 * the steps is viable.
                 * Only reset master info in rpc.metad -
                 * don't reset local cached information
                 * which will be used to set master information
                 * back in case of failure (rollback).
                 */
                if (clnt_withdrawset(nd->nd_nodename, sp, ep))
                    goto rollback;
                /* Sets withdraw flag on all nodes in list */
                if (clnt_upd_nr_flags(nd->nd_nodename, sp,
                    sd->sd_nodelist, MD_NR_WITHDRAW,
                    NULL, ep)) {
                    goto rollback;
                }
                if (clnt_mnsetmaster(nd->nd_nodename, sp,
                    "", MD_MN_INVALID_NID, ep)) {
                    goto rollback;
                }
                nd = nd->nd_next;
            }
        } else {
            if (halt_set(sp, ep))
                goto rollback;
        }

        RB_TEST(15, "deletedrives", ep)
    }

    RB_TEST(16, "deletedrives", ep)

out:
    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Send reinit command to mdcommd which forces it to get
     * fresh set description.
     */
    if (suspendall_flag) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
    }

    /*
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if ((suspend1_flag) || (suspendall_flag)) {
        /* Send resume */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }
    if (lock_flag) {
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                if (clnt_unlock_set(nd->nd_nodename,
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_unlock_set(sd->sd_nodes[i],
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
            }
        }
        cl_set_setkey(NULL);
    }

    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
        if (!(MD_MNSET_DESC(sd))) {
            md_rb_sig_handling_off(md_got_sig(), md_which_sig());
        }
    }

    if (MD_MNSET_DESC(sd)) {
        /* release signals back to what they were on entry */
        if (procsigs(FALSE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    return (rval);

rollback:
    /* all signals already blocked for MN disket */
    if (!(MD_MNSET_DESC(sd))) {
        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    rval = -1;

    max_genid = sd->sd_genid;

    /* Set the master on all nodes first thing */
    if (rb_level > 5) {
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
                    continue;
                }
                /*
                 * Set master on all re-joining nodes to be
                 * my cached view of master.
                 */
                if (clnt_mnsetmaster(nd->nd_nodename, sp,
                    sd->sd_mn_master_nodenm,
                    sd->sd_mn_master_nodeid, &xep)) {
                    mdclrerror(&xep);
                }
            }
        }
    }

    /* level 3 */
    if (rb_level > 2) {
        md_set_record       *sr;
        md_mnset_record     *mnsr;
        md_drive_record     *dr;
        int         sr_drive_cnt;

        /*
         * See if we have to re-add the drives specified.
         */
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                /*
                 * Must get current set record from each
                 * node to see what else must be done
                 * to recover.
                 * Record should be for a multi-node diskset.
                 */
                if (clnt_mngetset(nd->nd_nodename, sp->setname,
                    MD_SET_BAD, &mnsr, &xep) == -1) {
                    mdclrerror(&xep);
                    nd = nd->nd_next;
                    continue;
                }

                /*
                 * If all drives are already there, skip
                 * to next node.
                 */
                sr_drive_cnt = 0;
                dr = mnsr->sr_drivechain;
                while (dr) {
                    sr_drive_cnt++;
                    dr = dr->dr_next;
                }
                if (sr_drive_cnt == current_drv_cnt) {
                    free_sr((md_set_record *)mnsr);
                    nd = nd->nd_next;
                    continue;
                }

                /* Readd all drives */
                if (clnt_adddrvs(nd->nd_nodename, sp, dd,
                    mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
                    mdclrerror(&xep);

                free_sr((struct md_set_record *)mnsr);
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                /* Record should be for a non-multi-node set */
                if (clnt_getset(sd->sd_nodes[i], sp->setname,
                    MD_SET_BAD, &sr, &xep) == -1) {
                    mdclrerror(&xep);
                    continue;
                }

                /*
                 * Set record structure was allocated from RPC
                 * routine getset so this structure is only of
                 * size md_set_record even if the MN flag is
                 * set.  So, clear the flag so that the free
                 * code doesn't attempt to free a structure
                 * the size of md_mnset_record.
                 */
                if (MD_MNSET_REC(sr)) {
                    sr->sr_flags &= ~MD_SR_MN;
                    free_sr(sr);
                    continue;
                }

                /* Drive already added, skip to next node */
                if (sr->sr_drivechain != NULL) {
                    free_sr(sr);
                    continue;
                }

                if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
                    sr->sr_ctime, sr->sr_genid, &xep) == -1)
                    mdclrerror(&xep);

                free_sr(sr);
            }
        }
        max_genid += 2;
    }

    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * At this point in time, don't know which nodes are joined
     * to the set.  So, send a reinit command to mdcommd
     * which forces it to get fresh set description.  Then send resume.
     *
     * Later, this code will use rpc.mdcommd messages to reattach disks
     * and then rpc.mdcommd may be suspended again, rest of the nodes
     * joined, rpc.mdcommd reinited and then resumed.
     */
    if (suspendall_flag) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
                mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd.\n"));
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }

        /* Send resume */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /*
             * Resume all classes but class 1 so that lock is held
             * against meta* commands.
             * To later resume class1, must issue a class0 resume.
             */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0,
                MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
                mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    /* level 2 */
    if (rb_level > 1) {
        mdnamelist_t    *nlp;
        mdname_t    *np;

        for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
            uint_t  rep_slice;

            if ((meta_replicaslice(ddp->dd_dnp,
                &rep_slice, &xep) != 0) ||
                ((np = metaslicename(ddp->dd_dnp, rep_slice,
                &xep)) == NULL)) {
                mdclrerror(&xep);
                continue;
            }
            nlp = NULL;
            (void) metanamelist_append(&nlp, np);

            if (meta_db_attach(sp, nlp,
                (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
                &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
                NULL, &xep) == -1)
                mdclrerror(&xep);

            metafreenamelist(nlp);
        }
        /* Re-balance */
        if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
            mdclrerror(&xep);
    }

    /* level 4 */
    if (rb_level > 3) {
        if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
            if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
                mdclrerror(&xep);
        }
    }

    /* level 5 */
    if (rb_level > 4) {
        if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
            mdclrerror(&xep);
    }

    /*
     * If at least one node needs to be rejoined to MN diskset,
     * then suspend commd again.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
                nd = nd->nd_next;
                continue;
            }
            break;
        }
        if (nd) {
            /*
             * Found node that will be rejoined so
             * notify rpc.mdcommd on all nodes of a nodelist change.
             * Start by suspending rpc.mdcommd (which drains it of
             * all messages), then change the nodelist followed by
             * a reinit and resume.
             */
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                if (clnt_mdcommdctl(nd->nd_nodename,
                    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
                    MD_MSCF_NO_FLAGS, &xep)) {
                    mdclrerror(&xep);
                }
                suspendall_flag_rb = 1;
                nd = nd->nd_next;
            }
        }
    }


    /* level 6 */
    if (rb_level > 5) {
        if (MD_MNSET_DESC(sd)) {
            int join_flags = 0;

            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                /* Only rejoin nodes that were joined before */
                if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
                    nd = nd->nd_next;
                    continue;
                }
                /*
                 * Rejoin nodes to same state as before -
                 * either STALE or non-STALE.
                 */
                if (stale_bool == TRUE)
                    join_flags = MNSET_IS_STALE;
                if (clnt_joinset(nd->nd_nodename, sp,
                    join_flags, &xep))
                    mdclrerror(&xep);
                /* Sets OWN flag on all nodes in list */
                if (clnt_upd_nr_flags(nd->nd_nodename, sp,
                    sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
                    mdclrerror(&xep);
                }
                nd = nd->nd_next;
            }
        } else {
            if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
                mdclrerror(&xep);

            /* No special flag for traditional diskset */
            if (snarf_set(sp, NULL, &xep))
                mdclrerror(&xep);
        }
    }

    /* level 1 */
    if (rb_level > 0) {
        /*
         * Mark the drives as OK.
         */
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            /* All nodes are guaranteed to be ALIVE */
            while (nd) {
                /*
                 * Must be last action before unlock.
                 * In case of panic, recovery code checks
                 * for MD_DR_OK to know that drive
                 * and possible master are fully added back.
                 */
                if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
                    MD_DR_OK, &xep) == -1)
                    mdclrerror(&xep);
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
                    MD_DR_OK, &xep) == -1)
                    mdclrerror(&xep);

            }
        }
        max_genid += 2;
        resync_genid(sp, sd, max_genid, 0, NULL);
    }
    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Send a reinit command to mdcommd which forces it to get
     * fresh set description.
     */
    if (suspendall_flag_rb) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
                mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd.\n"));
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }
    }

    /*
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
        /* Send resume */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }


    /* level 0 */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    /* Don't test lock flag since guaranteed to be set if in rollback */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
                mdclrerror(&xep);
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
                mdclrerror(&xep);
        }
    }
    cl_set_setkey(NULL);

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    metafreedrivedesc(&dd);

    if (flush_set_onerr) {
        metaflushsetname(sp);
        if (!(MD_MNSET_DESC(sd))) {
            md_rb_sig_handling_off(md_got_sig(), md_which_sig());
        }
    }

    return (rval);
}