libmeta/common/meta_set.c

	meta_set.c revision bf85a12b7c81d0745d5a8aff65baeff50006cde9
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Just in case we're not in a build environment, make sure that
 * TEXT_DOMAIN gets set to something.
 */
#if !defined(TEXT_DOMAIN)
#define TEXT_DOMAIN "SYS_TEST"
#endif

/*
 * Metadevice diskset interfaces
 */

#include "meta_set_prv.h"
#include <meta.h>
#include <metad.h>
#include <mdmn_changelog.h>
#include <sys/lvm/md_crc.h>
#include <sys/utsname.h>
#include <sdssc.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
extern  char    *blkname(char *);

/*
 * Build a drive descriptor list for set `sp' by walking the drive
 * records chained off its set record.
 *
 * With MD_BYPASS_DAEMON set in `flags' the set record is fetched with
 * metad_getsetbynum(), the caller's `sideno' is replaced by this node's
 * side and `sp' is replaced by a fake set name; the record is not freed
 * in that mode.  Otherwise the record comes from getsetbyname() and is
 * released with free_sr() before returning.
 *
 * Returns the head of the new list (NULL when the set has no drive
 * records), or NULL when a lookup fails.
 */
static md_drive_desc *
dr2drivedesc(
    mdsetname_t *sp,
    side_t      sideno,
    int     flags,
    md_error_t  *ep
)
{
    const int       bypass = ((flags & MD_BYPASS_DAEMON) != 0);
    md_set_record   *setrec;
    md_drive_record *drvrec;
    md_drive_desc   *head = NULL;

    if (bypass) {
        md_set_desc *setdesc;

        setrec = metad_getsetbynum(sp->setno, ep);
        if (setrec == NULL)
            return (NULL);

        /*
         * NOTE(review): the set descriptor is handed to getnodeside()
         * without a NULL check -- confirm that is acceptable.
         */
        setdesc = metaget_setdesc(sp, ep);
        sideno = getnodeside(mynode(), setdesc);
        sp = metafakesetname(sp->setno, setrec->sr_setname);
    } else {
        setrec = getsetbyname(sp->setname, ep);
        if (setrec == NULL)
            return (NULL);
    }

    assert(sideno != MD_SIDEWILD);

    /*
     * WARNING:
     * Fetching the drive name from the namespace also fetches the devid
     * recorded there.  That devid can be stale while a disk is being
     * replaced via a rebind, so callers that depend on any drive name
     * information must preserve it themselves.  The rebind code, for
     * example, saves the devid of the new disk and copies it back once
     * the caller of this function has finished.
     */
    drvrec = setrec->sr_drivechain;
    while (drvrec != NULL) {
        mddrivename_t   *drvname;

        drvname = metadrivename_withdrkey(sp, sideno, drvrec->dr_key,
            flags, ep);
        if (drvname == NULL) {
            /* Give up: drop the record (if ours) and the partial list */
            if (!bypass)
                free_sr(setrec);
            metafreedrivedesc(&head);
            return (NULL);
        }

        (void) metadrivedesc_append(&head, drvname, drvrec->dr_dbcnt,
            drvrec->dr_dbsize, drvrec->dr_ctime, drvrec->dr_genid,
            drvrec->dr_flags);

        drvrec = drvrec->dr_next;
    }

    if (!bypass)
        free_sr(setrec);

    return (head);
}

/*
 * Rebuild the side name list of drive `dnp' for set `sp'.
 *
 * The existing list is flushed, then one entry per side is looked up in
 * the local set's namespace by dnp->side_names_key and appended in order.
 * For a multi-node set only this node's side is looked up; for a
 * traditional set every non-empty host slot is tried and a side that is
 * missing (ENOENT) is silently skipped.
 *
 * Returns 0 on success, -1 on failure (error in `ep').
 */
static int
get_sidenmlist(
    mdsetname_t *sp,
    mddrivename_t   *dnp,
    md_error_t  *ep
)
{
    md_set_desc     *setdesc;
    mdsidenames_t   *entry;
    mdsidenames_t   **tailp;
    int             slot;

    setdesc = metaget_setdesc(sp, ep);
    if (setdesc == NULL)
        return (-1);

    metaflushsidenames(dnp);
    tailp = &dnp->side_names;

    if (!(MD_MNSET_DESC(setdesc))) {
        /* Traditional set: one possible side per host slot */
        for (slot = 0; slot < MD_MAXSIDES; slot++) {
            /* Nothing to do for an unused slot */
            if (setdesc->sd_nodes[slot][0] == '\0')
                continue;

            entry = Zalloc(sizeof (*entry));
            entry->sideno = slot;
            entry->cname = meta_getnmentbykey(MD_LOCAL_SET,
                slot+SKEW, dnp->side_names_key, &entry->dname,
                &entry->mnum, NULL, ep);

            if (entry->cname != NULL) {
                /* Link the new entry at the tail */
                assert(*tailp == NULL);
                *tailp = entry;
                tailp = &entry->next;
                continue;
            }

            /*
             * While a host is being added, the side for this
             * disk may not exist yet; it is added once the
             * addhosts processing has completed.  Treat the
             * missing side as "not there yet" and move on.
             */
            if (mdissyserror(ep, ENOENT)) {
                mdclrerror(ep);
                Free(entry);
                continue;
            }

            if (entry->dname != NULL)
                Free(entry->dname);
            Free(entry);
            return (-1);
        }

        return (0);
    }

    /*
     * Multi-node set: the local mddb only stores side information
     * for this node, so that is the only side fetched.
     */
    if (setdesc->sd_mn_mynode == NULL)
        return (0);

    entry = Zalloc(sizeof (*entry));
    entry->sideno = setdesc->sd_mn_mynode->nd_nodeid;
    entry->cname = meta_getnmentbykey(MD_LOCAL_SET, entry->sideno,
        dnp->side_names_key, &entry->dname, &entry->mnum, NULL, ep);
    if (entry->cname == NULL) {
        if (entry->dname != NULL)
            Free(entry->dname);
        Free(entry);
        return (-1);
    }

    /* Link the new entry at the tail */
    assert(*tailp == NULL);
    *tailp = entry;
    tailp = &entry->next;

    return (0);
}

/*
 * Convert a replica list into a drive descriptor list.
 *
 * Each distinct drive (compared by cname) that holds at least one replica
 * in `rlp' gets one descriptor; its dd_dbcnt is the number of replicas
 * found on that drive.  Every descriptor is given the same replica size:
 * the smallest r_nblk in `rlp', or the set's default size when that
 * minimum is not positive (including an empty list).
 *
 * Returns the new list, or NULL when `rlp' is empty or the set
 * descriptor cannot be obtained (error in `ep' in the latter case).
 */
static md_drive_desc *
rl_to_dd(
    mdsetname_t     *sp,
    md_replicalist_t    *rlp,
    md_error_t      *ep
)
{
    md_replicalist_t    *rl;
    md_replica_t        *r;
    md_drive_desc       *dd = NULL;
    md_drive_desc       *d;
    int         found;
    md_set_desc     *sd;
    daddr_t         nblks = 0;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (NULL);

    /* find the smallest existing replica */
    for (rl = rlp; rl != NULL; rl = rl->rl_next) {
        r = rl->rl_repp;
        nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
    }

    /* no usable size found, fall back to the default for the set type */
    if (nblks <= 0)
        nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;

    for (rl = rlp; rl != NULL; rl = rl->rl_next) {
        r = rl->rl_repp;

        found = 0;
        for (d = dd; d != NULL; d = d->dd_next) {
            if (strcmp(r->r_namep->drivenamep->cname,
                d->dd_dnp->cname) == 0) {
                found = 1;
                /*
                 * Count the replica against the drive that
                 * matched, not against the head of the list.
                 */
                d->dd_dbcnt++;
                break;
            }
        }

        /* first replica seen on this drive: start it with a count of 1 */
        if (! found)
            (void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
                1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
    }

    return (dd);
}

/*
 * Exported Entry Points
 */

/*
 * Return the value reported by the MD_IOCGETNSET ioctl.
 *
 * The value is kept in a function-local static and the ioctl is only
 * issued while that static is still zero.  Returns 0 when the ioctl
 * fails (error in `ep').
 */
set_t
get_max_sets(md_error_t *ep)
{
    static set_t    cached_nsets = 0;

    if (cached_nsets == 0 &&
        metaioctl(MD_IOCGETNSET, &cached_nsets, ep, NULL) != 0)
        return (0);

    return (cached_nsets);
}

/*
 * Return the value reported by the MD_MED_GET_NMED ioctl.
 *
 * The value is kept in a function-local static and the ioctl is only
 * issued while that static is still zero.  Returns 0 when the ioctl
 * fails (error in `ep').
 */
int
get_max_meds(md_error_t *ep)
{
    static int  cached_nmeds = 0;

    if (cached_nmeds == 0 &&
        metaioctl(MD_MED_GET_NMED, &cached_nmeds, ep, NULL) != 0)
        return (0);

    return (cached_nmeds);
}

/*
 * Return this node's side number in set `sp'.
 *
 * Set number 0 always yields side 0.  For any other set the side is
 * looked up from the set descriptor by this node's name.  When the set
 * descriptor cannot be obtained MD_SIDEWILD is returned; when this node
 * has no side in the set an MDE_DS_HOSTNOSIDE error is recorded in `ep'
 * and the value produced by mddserror() is returned.
 */
side_t
getmyside(mdsetname_t *sp, md_error_t *ep)
{
    md_set_desc *setdesc;
    char        *host = NULL;
    side_t      side;

    if (sp->setno == 0)
        return (0);

    setdesc = metaget_setdesc(sp, ep);
    if (setdesc == NULL)
        return (MD_SIDEWILD);

    host = mynode();
    assert(host != NULL);

    side = getnodeside(host, setdesc);
    if (side == MD_SIDEWILD) {
        return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, host, NULL,
            host));
    }

    return (side);
}

/*
 * Look up a set record by set name.
 *
 * The record is requested from the daemon on this node.  If it turns out
 * to describe a multi-node set, the request is reissued to obtain the
 * multi-node record, which is returned cast to md_set_record.
 *
 * Returns the record, or NULL on failure.  When no record exists an
 * MDE_NO_SET error naming the set is recorded in `ep'.
 */
md_set_record *
getsetbyname(char *setname, md_error_t *ep)
{
    md_set_record   *setrec = NULL;
    md_mnset_record *mnrec = NULL;
    char            *what;
    size_t          whatlen;

    /* ask the daemon for the set record */
    if (clnt_getset(mynode(), setname, MD_SET_BAD, &setrec, ep) == -1)
        return (NULL);

    if (setrec != NULL) {
        /* A non-multi-node record can be handed back directly */
        if (!(MD_MNSET_REC(setrec)))
            return (setrec);

        /*
         * Multi-node set: the call must be reissued to fetch the
         * mnset information.  What clnt_getset gave us has to be
         * freed as if it were a non-multi-node record, hence the
         * flag is cleared first.  Inside the daemon the record is
         * a pointer into the setrecords array and must not be
         * freed.
         */
        if (! md_in_daemon) {
            setrec->sr_flags &= ~MD_SR_MN;
            free_sr(setrec);
        }

        if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnrec,
            ep) == -1)
            return (NULL);

        if (mnrec != NULL)
            return ((struct md_set_record *)mnrec);
    }

    /* nothing found: report the missing set by name */
    whatlen = strlen(setname) + 30;
    what = Malloc(whatlen);
    (void) snprintf(what, whatlen, "setname \"%s\"", setname);
    (void) mderror(ep, MDE_NO_SET, what);
    Free(what);

    return (NULL);
}

/*
 * Look up a set record by set number.
 *
 * The record is requested from the daemon on this node.  If it turns out
 * to describe a multi-node set, the request is reissued to obtain the
 * multi-node record, which is returned cast to md_set_record.
 *
 * Returns the record, or NULL on failure.  When no record exists an
 * MDE_NO_SET error naming the set number is recorded in `ep'.
 */
md_set_record *
getsetbynum(set_t setno, md_error_t *ep)
{
    /* start from NULL, as getsetbyname does, so the test below is safe */
    md_set_record       *sr = NULL;
    md_mnset_record     *mnsr = NULL;
    char            buf[100];

    if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
        return (NULL);

    if (sr != NULL) {
        /*
         * Returned record could be for a multi-node set or a
         * non-multi-node set.
         */
        if (MD_MNSET_REC(sr)) {
            /*
             * Record is for a multi-node set.  Reissue call
             * to get mnset information.  Need to free
             * record as if a non-multi-node set record since
             * that is what clnt_getset gave us.  If in
             * the daemon, don't free since this is a pointer
             * into the setrecords array.
             */
            if (! md_in_daemon) {
                sr->sr_flags &= ~MD_SR_MN;
                free_sr(sr);
            }
            if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
                ep) == -1)
                return (NULL);
            if (mnsr != NULL)
                return ((struct md_set_record *)mnsr);
        } else {
            return (sr);
        }
    }

    /* no such set */
    (void) snprintf(buf, sizeof (buf), "setno %u", setno);
    (void) mderror(ep, MDE_NO_SET, buf);
    return (NULL);
}

/*
 * Check whether drive `dnp' is used by any device in set `sp'.
 *
 * The list of devices is obtained from meta_getalldevs() (`check_db' is
 * passed straight through) and each entry's drive cname is compared with
 * dnp->cname.
 *
 * Returns 0 when no entry matches, -1 when the device list cannot be
 * obtained, and otherwise the value of mddserror() after recording
 * MDE_DS_DRIVEINUSE in `ep'.
 */
int
meta_check_drive_inuse(
    mdsetname_t *sp,
    mddrivename_t   *dnp,
    int     check_db,
    md_error_t  *ep
)
{
    mdnamelist_t    *devs = NULL;
    mdnamelist_t    *cur;
    int             result = 0;

    /* collect every underlying partition */
    if (meta_getalldevs(sp, &devs, check_db, ep) != 0)
        return (-1);

    /* stop at the first partition that lives on the drive */
    cur = devs;
    while (cur != NULL) {
        mdname_t    *part = cur->namep;

        if (strcmp(dnp->cname, part->drivenamep->cname) == 0) {
            result = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
                NULL, dnp->cname, sp->setname));
            break;
        }
        cur = cur->next;
    }

    /* release the list and hand back the verdict */
    metafreenamelist(devs);
    return (result);
}

/*
 * Check that this node owns diskset `sp'.
 *
 * Returns:
 *   0 - `sp' is the local set, or this node owns the set, or the set has
 *       no drive descriptors (see the cleanup path below)
 *  -1 - a lookup failed (error in `ep')
 *  otherwise the value of mddserror() after recording MDE_DS_NOOWNER
 *
 * Errors raised while cleaning up unexpected ownership are collected in
 * a private error structure and discarded; they never reach `ep'.
 */
int
meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
{
    int         ownset;
    md_set_desc     *sd;
    md_drive_desc       *dd;
    md_replicalist_t    *rlp = NULL;
    md_error_t      xep = mdnullerror;

    /* The local set needs no ownership check */
    if (metaislocalset(sp))
        return (0);

    /* The return value alone does not signal failure, so test ep */
    ownset = own_set(sp, NULL, TRUE, ep);
    if (! mdisok(ep))
        return (-1);

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    /* A NULL list is legitimate here, so failure is detected through ep */
    dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
    if (! mdisok(ep))
        return (-1);

    /* If we have no drive descriptors, check for no ownership */
    if (dd == NULL) {
        if (ownset == MD_SETOWNER_NONE)
            return (0);

        /* If ownership somehow has come to exist, we must clean up */

        /* Best effort from here on: every failure is cleared from xep */
        if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
            &xep) < 0)
            mdclrerror(&xep);

        /* Derive drive descriptors from whatever replicas were found */
        if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
            if (! mdisok(&xep))
                mdclrerror(&xep);

        /* Release ownership by drive list, except for MN and AT sets */
        if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
            if (rel_own_bydd(sp, dd, TRUE, &xep))
                mdclrerror(&xep);
        }

        if (halt_set(sp, &xep))
            mdclrerror(&xep);

        metafreereplicalist(rlp);

        metafreedrivedesc(&dd);

        return (0);
    }

    /*
     * Free the drive list hanging off the set descriptor.
     * NOTE(review): presumably `dd' above is this same list, which is
     * why `dd' itself is not freed here -- confirm against
     * metaget_drivedesc().
     */
    metafreedrivedesc(&sd->sd_drvs);

    if (ownset == MD_SETOWNER_YES)
        return (0);

    return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
        sp->setname));
}

/*
 * Check that host `hostname' owns diskset `sp'.
 *
 * Returns:
 *   0 - `sp' is the local set, the set has no drive descriptors, or the
 *       host reports that it owns the set
 *  -1 - a lookup or the ownership query failed (error in `ep')
 *  otherwise the value of mddserror() after recording either
 *  MDE_DS_NODENOTINSET (host has no side in the set) or
 *  MDE_DS_NODEISNOTOWNER (host does not own the set)
 */
int
meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
{
    md_set_desc     *setdesc;
    md_drive_desc   *drives;
    int             is_owner;

    if (metaislocalset(sp))
        return (0);

    setdesc = metaget_setdesc(sp, ep);
    if (setdesc == NULL)
        return (-1);

    /* The host must be a member of the set */
    if (getnodeside(hostname, setdesc) == MD_SIDEWILD) {
        return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
            hostname, NULL, sp->setname));
    }

    /* A NULL list is legitimate, so failure is detected through ep */
    drives = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
    if (! mdisok(ep))
        return (-1);

    /* Ask the host itself whether it owns the set */
    if (clnt_ownset(hostname, sp, &is_owner, ep) == -1)
        return (-1);

    /* A set without drives passes regardless of the answer */
    if (drives == NULL)
        return (0);

    metafreedrivedesc(&setdesc->sd_drvs);

    if (is_owner != TRUE) {
        return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno,
            hostname, NULL, sp->setname));
    }

    return (0);
}

/*
 * Report whether a node appears in a multi-node diskset membership list.
 *
 * The caller supplies the node to look for and the membership list as
 * returned from meta_read_nodelist.  Allocation and release of the list
 * stay with the caller because this routine expects to be called many
 * times against the same list.
 *
 * The node is matched by `node_id' when that is non-zero; otherwise it is
 * matched by `node_name' when that is non-NULL.  With neither given the
 * answer is "not a member".
 *
 * Returns:
 *  1 - if a member
 *  0 - not a member
 */
int
meta_is_member(
    char                *node_name,
    md_mn_nodeid_t          node_id,
    mndiskset_membershiplist_t  *nl
)
{
    mndiskset_membershiplist_t  *member;
    int                         match_by_name;

    if (node_id != 0)
        match_by_name = 0;
    else if (node_name != NULL)
        match_by_name = 1;
    else
        return (0);

    for (member = nl; member != NULL; member = member->next) {
        if (match_by_name) {
            /* Names are compared as strings */
            if (strcmp(member->msl_node_name, node_name) == 0)
                return (1);
        } else if (member->msl_node_id == node_id) {
            /* Node ids are compared directly */
            return (1);
        }
    }

    /* Ran off the end of the list without a match */
    return (0);
}

/*
 * meta_getside_devinfo should go to the host that
 * has the device, to return the device name, driver name, minor num.
 * We can take the big cheat for now, since it is a requirement
 * that the device names and device numbers are the same, and
 * just get the info locally.
 *
 * This routine is very similar to meta_getnextside_devinfo except
 * that the specific side to be used is being passed in (and is
 * currently not looked at).
 *
 * Each of ret_bname, ret_dname and ret_mnum may be NULL when the caller
 * is not interested in that value.  The strings stored through ret_bname
 * and ret_dname are allocated with Strdup.
 *
 * Exit status:
 *   1 - Side info has been returned
 *  -1 - An error has been detected; *ret_bname and *ret_dname are NULL
 *       and *ret_mnum is NODEV32
 */
/*ARGSUSED*/
int
meta_getside_devinfo(
    mdsetname_t *sp,        /* for this set */
    char        *bname,     /* local block name (myside) */
    side_t      sideno,     /* sideno */
    char        **ret_bname,    /* block device name of returned side */
    char        **ret_dname,    /* driver name of returned side */
    minor_t     *ret_mnum,  /* minor number of returned side */
    md_error_t  *ep
)
{
    mdname_t    *np;

    /* Start from "nothing returned" so the error paths are well defined */
    if (ret_bname != NULL)
        *ret_bname = NULL;
    if (ret_dname != NULL)
        *ret_dname = NULL;
    if (ret_mnum != NULL)
        *ret_mnum = NODEV32;


    if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
        return (-1);

/*
 * NOTE (future) - There will be more work here once devids are integrated
 * into disksets.  Then the side should be used to find the correct
 * host and the b/d names should be gotten from that host.
 */

    /*
     * Return the side info.
     */
    if (ret_bname != NULL)
        *ret_bname = Strdup(np->bname);

    if (ret_dname != NULL) {
        mdcinfo_t   *cinfo;

        if ((cinfo = metagetcinfo(np, ep)) == NULL) {
            /* Do not leak the block name copied just above */
            if (ret_bname != NULL) {
                if (*ret_bname != NULL)
                    Free(*ret_bname);
                *ret_bname = NULL;
            }
            return (-1);
        }

        *ret_dname = Strdup(cinfo->dname);
    }

    if (ret_mnum != NULL)
        *ret_mnum = meta_getminor(np->dev);

    return (1);
}

/*
 * Step to the side following *sideno in set `sp' and return the device
 * information for `bname' as seen from that side.  Pass MD_SIDEWILD in
 * *sideno to start the iteration.
 *
 * By default the local information is copied.  For a traditional
 * diskset whose device has a devid, the information is instead obtained
 * from the remote node using the devid of the disk.
 *
 * Each of ret_bname, ret_dname and ret_mnum may be NULL when the caller
 * is not interested in that value.  The strings stored through ret_bname
 * and ret_dname are allocated with Strdup.
 *
 * Exit status:
 *   0 - No more side info to return
 *   1 - More side info's to return
 *  -1 - An error has been detected
 *
 * NOTE: when -1 is returned after the default values have been copied
 * (open or remote lookup failure), *ret_bname and *ret_dname still hold
 * those allocated defaults; this function does not release them.
 */
int
meta_getnextside_devinfo(
    mdsetname_t *sp,        /* for this set */
    char        *bname,     /* local block name (myside) */
    side_t      *sideno,    /* previous sideno & returned sideno */
    char        **ret_bname,    /* block device name of returned side */
    char        **ret_dname,    /* driver name of returned side */
    minor_t     *ret_mnum,  /* minor number of returned side */
    md_error_t  *ep
)
{
    md_set_desc *sd;
    int     i;
    mdname_t    *np;
    mddrivename_t   *dnp;
    char        *devidstr = NULL;
    int     devidstrlen;
    md_dev64_t  retdev = NODEV64;
    char        *ret_devname = NULL;
    char        *ret_blkdevname = NULL;
    char        *ret_driver = NULL;
    char        *nodename;
    int     fd;
    int     ret = -1;
    char        *minor_name = NULL;
    md_mnnode_desc  *nd;


    if (ret_bname != NULL)
        *ret_bname = NULL;
    if (ret_dname != NULL)
        *ret_dname = NULL;
    if (ret_mnum != NULL)
        *ret_mnum = NODEV32;

    if (metaislocalset(sp)) {
        /* no more sides - we are done */
        if (*sideno != MD_SIDEWILD)
            return (0);

        /* First time through -  set up return sideno */
        *sideno = 0;
    } else {

        /*
         * Find the next sideno, starting after the one given.
         */
        if ((sd = metaget_setdesc(sp, ep)) == NULL)
            return (-1);

        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            if ((*sideno == MD_SIDEWILD) &&
                (nd != (struct md_mnnode_desc *)NULL)) {
                *sideno = nd->nd_nodeid;
            } else {
                while (nd) {
                    /*
                     * Found given sideno, now find
                     * next sideno, if there is one.
                     */
                    if ((*sideno == nd->nd_nodeid) &&
                        (nd->nd_next !=
                        (struct md_mnnode_desc *)NULL)) {
                        *sideno =
                            nd->nd_next->nd_nodeid;
                        break;
                    }
                    nd = nd->nd_next;
                }
                if (nd == NULL) {
                    return (0);
                }
            }
            if (*sideno == MD_SIDEWILD)
                return (0);
        } else {
            for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
                /* Find next full slot */
                if (sd->sd_nodes[i][0] != '\0')
                    break;

            /* No more sides - we are done */
            if (i == MD_MAXSIDES)
                return (0);

            /* Set up the return sideno */
            *sideno = i;
            nodename = (char *)sd->sd_nodes[i];
        }
    }

    /*
     * Need to pass the node the devid of the disk and get it to
     * send back the details of the disk from that side.
     */
    if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
        return (-1);

    dnp = np->drivenamep;

    /*
     * By default, set up the parameters so that they are copied out.
     */
    if (ret_bname != NULL)
        *ret_bname = Strdup(np->bname);

    if (ret_dname != NULL) {
        mdcinfo_t   *cinfo;

        if ((cinfo = metagetcinfo(np, ep)) == NULL) {
            /* Do not leak the block name copied just above */
            if (ret_bname != NULL) {
                if (*ret_bname != NULL)
                    Free(*ret_bname);
                *ret_bname = NULL;
            }
            return (-1);
        }

        *ret_dname = Strdup(cinfo->dname);
    }

    if (ret_mnum != NULL)
        *ret_mnum = meta_getminor(np->dev);

    /*
     * Try some optimization. If this is the local set or the device
     * is a metadevice then just copy the information. If the device
     * does not have a devid (due to not having a minor name) then
     * fall back to the pre-devid behaviour of copying the information
     * on the device: this is okay because the sanity checks before this
     * call would have found any issues with the device. If it's a
     * multi-node diskset also just return ie. copy.
     */
    if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
        (MD_MNSET_DESC(sd)))
        return (1);

    if (np->minor_name == (char *)NULL) {
        /*
         * Have to get the minor name then. The slice should exist
         * on the disk because it will have already been repartitioned
         * up prior to getting to this point.
         */
        if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
            (void) mdsyserror(ep, errno, np->bname);
            return (-1);
        }
        /*
         * NOTE(review): the result of devid_get_minor_name is ignored;
         * if it can fail here, minor_name stays NULL and is passed to
         * Strdup -- confirm whether that needs handling.
         */
        (void) devid_get_minor_name(fd, &minor_name);
        np->minor_name = Strdup(minor_name);
        devid_str_free(minor_name);
        (void) close(fd);
    }

    /* allocate extra space for "/" and NULL hence +2 */
    devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
    devidstr = (char *)Malloc(devidstrlen);

    /*
     * As a minor name is supplied then the ret_devname will be
     * appropriate to that minor_name and in this case it will be
     * a block device ie /dev/dsk.
     */
    (void) snprintf(devidstr, devidstrlen,
        "%s/%s", dnp->devid, np->minor_name);

    ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
        np->bname, &ret_devname, &ret_driver, ep);

    Free(devidstr);

    /*
     * If the other side is not running device id in disksets,
     * 'ret' is set to ENOTSUP in which case we fallback to
     * the existing behaviour
     */
    if (ret == ENOTSUP)
        return (1);
    else if (ret == -1)
        return (-1);

    /*
     * ret_devname comes from the rpc call and is a
     * raw device name. We need to make this into a
     * block device via blkname for further processing.
     * Unfortunately, when our device id isn't found in
     * the system, the rpc call will return a " " in
     * ret_devname in which case we need to fill that in
     * as ret_blkname because blkname of " " returns NULL.
     *
     * NOTE(review): ret_devname, ret_driver and the blkname() result
     * are not released here -- confirm who owns them.
     */
    if (ret_bname != NULL && ret_devname != NULL) {
        ret_blkdevname = blkname(ret_devname);

        /* Release the default copy before replacing it */
        if (*ret_bname != NULL)
            Free(*ret_bname);

        if (ret_blkdevname == NULL)
            *ret_bname = Strdup(ret_devname);
        else
            *ret_bname = Strdup(ret_blkdevname);
    }

    if (ret_dname != NULL && ret_driver != NULL) {
        /* Release the default copy before replacing it */
        if (*ret_dname != NULL)
            Free(*ret_dname);

        *ret_dname = Strdup(ret_driver);
    }

    if (ret_mnum != NULL)
        *ret_mnum = meta_getminor(retdev);

    return (1);
}

/*
 * Search the disksets numbered 1 up to (but not including) the value of
 * get_max_sets() for one that contains drive `dnp'.
 *
 * With `bypass_daemon' set, each set is addressed through a fake set
 * name instead of being looked up by number.  Sets that do not exist
 * (MDE_NO_SET) are skipped.
 *
 * Returns:
 *   0 - the search finished; *spp is the first set containing the drive,
 *       or NULL when none does.  The search also ends with 0 as soon as
 *       a set lookup reports MDE_DB_NODB.
 *  -1 - an error has been detected (error in `ep')
 */
int
meta_is_drive_in_anyset(
    mddrivename_t   *dnp,
    mdsetname_t **spp,
    int     bypass_daemon,
    md_error_t  *ep
)
{
    set_t       nsets;
    set_t       cur;
    mdsetname_t *cand;
    int         found;

    nsets = get_max_sets(ep);
    if (nsets == 0)
        return (-1);

    assert(spp != NULL);
    *spp = NULL;

    for (cur = 1; cur < nsets; cur++) {
        if (bypass_daemon) {
            cand = metafakesetname(cur, NULL);
        } else {
            cand = metasetnosetname(cur, ep);
            if (cand == NULL) {
                /* No database at all: nothing can be in a set */
                if (mdismddberror(ep, MDE_DB_NODB)) {
                    mdclrerror(ep);
                    return (0);
                }
                /* This set number is simply unused */
                if (mdiserror(ep, MDE_NO_SET)) {
                    mdclrerror(ep);
                    continue;
                }
                return (-1);
            }
        }

        found = meta_is_drive_in_thisset(cand, dnp, bypass_daemon, ep);
        if (found == -1) {
            /* An unused set number is not an error here either */
            if (mdiserror(ep, MDE_NO_SET)) {
                mdclrerror(ep);
                continue;
            }
            return (-1);
        }

        if (found) {
            *spp = cand;
            return (0);
        }
    }

    return (0);
}

/*
 * Report whether drive `dnp' (matched by cname) is one of the drives of
 * set `sp'.
 *
 * With `bypass_daemon' set the drive list is built by dr2drivedesc();
 * otherwise it comes from metaget_drivedesc().
 *
 * Returns:
 *   1 - the drive is in the set
 *   0 - the drive is not in the set (including a set with no drives)
 *  -1 - the drive list could not be obtained (error in `ep')
 *
 * NOTE(review): the list built by dr2drivedesc() in bypass mode is not
 * freed here -- confirm whether that list is owned by this function.
 */
int
meta_is_drive_in_thisset(
    mdsetname_t *sp,
    mddrivename_t   *dnp,
    int     bypass_daemon,
    md_error_t  *ep
)
{
    md_drive_desc   *drives;
    md_drive_desc   *cur;

    if (bypass_daemon) {
        drives = dr2drivedesc(sp, MD_SIDEWILD,
            (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
    } else {
        drives = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
    }

    /* An empty list is only a failure when an error was recorded */
    if (drives == NULL) {
        if (mdisok(ep))
            return (0);
        return (-1);
    }

    cur = drives;
    while (cur != NULL) {
        if (strcmp(cur->dd_dnp->cname, dnp->cname) == 0)
            return (1);
        cur = cur->dd_next;
    }

    return (0);
}

/*
 * Check to see if devid is in use in any diskset.
 * This is used in the case when a partial diskset is being imported
 * to make sure that the unavailable drive isn't already in use in an
 * already imported partial diskset.  Can't check on the cname since the
 * unavailable disk's cname is from the previous system and may collide
 * with a cname on this system.
 *
 * Sets numbered 1 up to (but not including) the value of get_max_sets()
 * are searched; sets that do not exist (MDE_NO_SET) are skipped.
 *
 * Return values:
 *   0: the search finished; *spp is the first set in which the devid
 *      was found, or NULL when it was not found in any set.  The search
 *      also ends with 0 as soon as a set lookup reports MDE_DB_NODB.
 *  -1: an error has been detected (error in `ep')
 */
int
meta_is_devid_in_anyset(
    void        *devid,
    mdsetname_t **spp,
    md_error_t  *ep
)
{
    set_t       nsets;
    set_t       cur;
    mdsetname_t *cand;
    int         found;

    nsets = get_max_sets(ep);
    if (nsets == 0)
        return (-1);

    assert(spp != NULL);
    *spp = NULL;

    for (cur = 1; cur < nsets; cur++) {
        cand = metasetnosetname(cur, ep);
        if (cand == NULL) {
            /* No database at all: nothing can be in a set */
            if (mdismddberror(ep, MDE_DB_NODB)) {
                mdclrerror(ep);
                return (0);
            }
            /* This set number is simply unused */
            if (mdiserror(ep, MDE_NO_SET)) {
                mdclrerror(ep);
                continue;
            }
            return (-1);
        }

        found = meta_is_devid_in_thisset(cand, devid, ep);
        if (found == -1) {
            /* An unused set number is not an error here either */
            if (mdiserror(ep, MDE_NO_SET)) {
                mdclrerror(ep);
                continue;
            }
            return (-1);
        }

        if (found) {
            *spp = cand;
            return (0);
        }
    }

    return (0);
}

/*
 * Report whether `devid' matches the devid of any drive in set `sp'.
 *
 * Each drive's devid string is decoded and compared with `devid' using
 * devid_compare().  Drives without a devid string, or whose devid string
 * does not decode, are skipped.
 *
 * Returns:
 *   1 - a drive with this devid is in the set
 *   0 - no drive in the set has this devid
 *  -1 - the drive list could not be obtained (error in `ep')
 */
int
meta_is_devid_in_thisset(
    mdsetname_t *sp,
    void        *devid,
    md_error_t  *ep
)
{
    md_drive_desc   *dd, *p;
    ddi_devid_t dd_devid;

    dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
    if (dd == NULL) {
        if (! mdisok(ep))
            return (-1);
        return (0);
    }

    for (p = dd; p != NULL; p = p->dd_next) {
        if (p->dd_dnp->devid == NULL)
            continue;

        /*
         * The decode result is judged solely by dd_devid, so make
         * sure it never carries an indeterminate or stale value
         * into the test below.
         */
        dd_devid = NULL;
        (void) devid_str_decode(p->dd_dnp->devid,
            &dd_devid, NULL);
        if (dd_devid == NULL)
            continue;

        if (devid_compare(devid, dd_devid) == 0) {
            devid_free(dd_devid);
            return (1);
        }
        devid_free(dd_devid);
    }
    return (0);
}

/*
 * Rebalance the database replicas across the drives currently in
 * diskset `sp'.  No drives are added or removed.
 *
 * This node must own the set.  While the balance runs, signals are
 * blocked and the set is locked on its member nodes; for a multi-node
 * set, class 1 messages are additionally suspended on every node marked
 * alive.  All of that is undone before returning, whether or not the
 * balance succeeded.
 *
 * Returns 0 on success and non-zero on failure (error in `ep').  A
 * failure to resume or unlock is reported through `ep' only when nothing
 * failed before it; otherwise the earlier error is kept.
 */
int
meta_set_balance(
    mdsetname_t     *sp,
    md_error_t      *ep
)
{
    md_set_desc     *sd;
    md_drive_desc   *dd, *curdd;
    daddr_t         dbsize;
    daddr_t         nblks;
    int             i;
    int             rval = 0;
    sigset_t        oldsigs;
    md_setkey_t     *cl_sk;
    md_error_t      xep = mdnullerror;
    md_mnnode_desc  *nd;
    int             suspend1_flag = 0;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;

    /* Make sure we own the set */
    if (meta_check_ownership(sp, ep) != 0)
        return (-1);

    /* END CHECK CODE */

    /*
     * Get drive descriptors for the drives that are currently in the set.
     */
    curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);

    if (! mdisok(ep))
        return (-1);

    /* Use the minimum replica size in use, or keep the default */
    if ((nblks = meta_db_minreplica(sp, ep)) < 0)
        mdclrerror(ep);
    else
        dbsize = nblks; /* adjust replica size */

    /* Make sure we are blocking all signals */
    if (procsigs(TRUE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    /*
     * Lock the set on current set members.
     * For MN diskset lock_set and SUSPEND are used to protect against
     * other meta* commands running on the other nodes.
     */
    if (MD_MNSET_DESC(sd)) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            /* Only nodes marked alive take part */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
                continue;

            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                rval = -1;
                goto out;
            }
        }

        /*
         * Lock out other meta* commands by suspending
         * class 1 messages across the diskset.
         */
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
                continue;

            if (clnt_mdcommdctl(nd->nd_nodename,
                COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
                MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            /* At least one node is suspended: resume is now needed */
            suspend1_flag = 1;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                rval = -1;
                goto out;
            }
        }
    }

    /* We are not adding or deleting any drives, just balancing */
    dd = NULL;

    /*
     * Balance the DB's according to the list of existing drives and the
     * list of added drives.
     */
    rval = meta_db_balance(sp, dd, curdd, dbsize, ep);

out:
    /*
     * Unlock diskset by resuming class 1 messages across the diskset.
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if (suspend1_flag) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
                continue;

            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                /*
                 * We are here because we failed to resume
                 * rpc.mdcommd.  However we potentially have
                 * an error from the previous call
                 * (meta_db_balance). If the previous call
                 * did fail, we keep that error and
                 * generate a perror with the string,
                 * "Unable to resume...".
                 * Setting rval to -1 ensures that in the
                 * next iteration of the loop, ep is not
                 * clobbered.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd."));
            }
        }
    }

    /* Unlock the set */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    if (MD_MNSET_DESC(sd)) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
                continue;

            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
            }
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
                /*
                 * Keep the first error only; later ones must
                 * still be cleared so xep is clean for the
                 * next call, as in the multi-node case.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
            }
        }
    }

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    cl_set_setkey(NULL);

    metaflushsetname(sp);

    return (rval);
}

/*
 * meta_set_destroy
 *  Remove a traditional (non multi-node) diskset as seen from the
 *  local host: the drives are deleted from the set on this node, the
 *  replicas are detached when this host is the only remaining user of
 *  the set, the mediator record is rewritten and pushed to the mediator
 *  hosts, and finally the set is deleted on the local node.
 *
 *  sp       - the set to destroy
 *  lock_set - when TRUE the set is locked on this node before the
 *             destructive work starts and unlocked again on the way out
 *  ep       - error return
 *
 *  Returns 0 on success and -1 on failure.  Failures of many of the
 *  individual steps are deliberately ignored (the error is cleared) so
 *  that as much of the set as possible gets removed.  A multi-node
 *  diskset is rejected with -1 without anything being stored in ep.
 *
 *  All signals are blocked for the duration of the call and restored
 *  before returning.
 */
int
meta_set_destroy(
    mdsetname_t *sp,
    int     lock_set,
    md_error_t  *ep
)
{
    int     i;
    med_rec_t   medr;
    md_set_desc *sd;
    md_drive_desc   *dd, *p, *p1;
    mddrivename_t   *dnp;
    mdname_t    *np;
    mdnamelist_t    *nlp = NULL;
    int     num_users = 0;
    int     has_set;
    side_t      mysideno;
    sigset_t    oldsigs;
    md_error_t  xep = mdnullerror;
    md_setkey_t *cl_sk;
    int     rval = 0;
    int     delete_end = 1;

    /* Make sure we are blocking all signals */
    if (procsigs(TRUE, &oldsigs, ep) < 0)
        return (-1);

    /*
     * No set descriptor: only treated as a failure when an error was
     * actually reported in ep; otherwise we leave with rval still 0.
     */
    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        if (! mdisok(ep))
            rval = -1;
        goto out;
    }

    /*
     * meta_set_destroy should not be called for a MN diskset.
     * This routine destroys a set without communicating this information
     * to the other nodes which would lead to an inconsistency in
     * the MN diskset.
     */
    if (MD_MNSET_DESC(sd)) {
        rval = -1;
        goto out;
    }

    /* Continue if a traditional diskset */

    /*
     * Check to see who has the set.  If we are not the last user of the
     * set, we will not touch the replicas.
     *
     * A host is only left out of the count when nodehasset() reports an
     * error (negative return); that error is cleared and ignored.
     */
    for (i = 0; i < MD_MAXSIDES; i++) {
        /* Skip empty slots */
        if (sd->sd_nodes[i][0] == '\0')
            continue;

        has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
            ep);

        if (has_set < 0) {
            mdclrerror(ep);
        } else
            num_users++;
    }

    /* A NULL drive list is fine as long as no error was reported */
    if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
        if (! mdisok(ep)) {
            rval = -1;
            goto out;
        }
    }

    if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
        rval = -1;
        goto out;
    }

    if (lock_set == TRUE) {
        /* Lock the set on our side */
        if (clnt_lock_set(mynode(), sp, ep)) {
            rval = -1;
            goto out;
        }
    }

    /*
     * A traditional diskset has no diskset stale information to send
     * since there can only be one owner node at a time.
     *
     * A failure to snarf the set is ignored.
     */
    if (snarf_set(sp, FALSE, ep))
        mdclrerror(ep);

    if (dd != NULL) {
        /*
         * Make sure that no drives are in use as parts of metadrives
         * or hot spare pools, this is one of the few error conditions
         * that will stop this routine, unless the environment has
         * META_DESTROY_SET_OK set, in which case, the operation will
         * proceed.
         */
        if (getenv("META_DESTROY_SET_OK") == NULL) {
            for (p = dd; p != NULL; p = p->dd_next) {
                dnp = p->dd_dnp;

                i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
                if (i == -1) {
                    /*
                     * Every path below ends in failure; the
                     * in-use error stays in ep while the
                     * follow-up calls report into xep.
                     */
                    /* need xep - wire calls clear error */
                    i = metaget_setownership(sp, &xep);
                    if (i == -1) {
                        rval = -1;
                        goto out;
                    }

                    mysideno = getmyside(sp, &xep);

                    if (mysideno == MD_SIDEWILD) {
                        rval = -1;
                        goto out;
                    }

                    /*
                     * If this host does not own the set,
                     * halt it before bailing out.
                     */
                    if (sd->sd_isown[mysideno] == FALSE)
                        if (halt_set(sp, &xep)) {
                            rval = -1;
                            goto out;
                        }

                    rval = -1;
                    goto out;
                }
            }
        }

        /*
         * Delete the drives from the set on the local node only;
         * a failure here is ignored.
         */
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            /* Skip non local nodes */
            if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
                continue;

            if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
                mdclrerror(ep);
        }

        /*
         * Go thru each drive and individually delete the replicas.
         * This way we can ignore individual errors.
         */
        for (p = dd; p != NULL; p = p->dd_next) {
            uint_t  rep_slice;

            dnp = p->dd_dnp;
            /*
             * Find the name of the replica slice, falling back to
             * slice 0 when the replica slice has no name.
             */
            if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
                (((np = metaslicename(dnp, rep_slice, ep))
                == NULL) &&
                ((np = metaslicename(dnp, MD_SLICE0, ep))
                == NULL))) {
                rval = -1;
                goto out;
            }

            /*
             * Same lookup again; when the fallback to slice 0 is
             * what succeeded, the error left in ep by the failed
             * replica slice lookup is cleared here.
             */
            if ((np = metaslicename(dnp,
                rep_slice, ep)) == NULL) {
                if ((np = metaslicename(dnp,
                    MD_SLICE0, ep)) == NULL) {
                    rval = -1;
                    goto out;
                }
                mdclrerror(ep);
            }

            /*
             * Temporarily cut this entry out of the list so that
             * rel_own_bydd() is handed a single-drive list, then
             * restore the link.
             */
            /* Yes this is UGLY!!! */
            p1 = p->dd_next;
            p->dd_next = NULL;
            if (rel_own_bydd(sp, p, FALSE, ep))
                mdclrerror(ep);
            p->dd_next = p1;

            /* Nothing to detach when the drive holds no replicas */
            if (p->dd_dbcnt == 0)
                continue;

            /*
             * Skip the replica removal if we are not the last user
             */
            if (num_users != 1)
                continue;

            /* Detach via a one-element name list; errors ignored */
            nlp = NULL;
            (void) metanamelist_append(&nlp, np);
            if (meta_db_detach(sp, nlp,
                (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
                mdclrerror(ep);
            metafreenamelist(nlp);
        }
    }

    if (halt_set(sp, ep)) {
        rval = -1;
        goto out;
    }

    /* Setup the mediator record */
    (void) memset(&medr, '\0', sizeof (med_rec_t));
    medr.med_rec_mag = MED_REC_MAGIC;
    medr.med_rec_rev = MED_REC_REV;
    medr.med_rec_fl  = 0;
    medr.med_rec_sn  = sp->setno;
    (void) strcpy(medr.med_rec_snm, sp->setname);
    medr.med_rec_meds = sd->sd_med; /* structure assignment */
    (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
    medr.med_rec_foff = 0;

    /*
     * If we are the last remaining user, then remove the mediator hosts
     */
    if (num_users == 1) {
        for (i = 0; i < MED_MAX_HOSTS; i++) {
            /* Only notify for mediator slots that were in use */
            if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
                SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
                    SVM_TAG_MEDIATOR, sp->setno, i);
            (void) memset(&medr.med_rec_meds.n_lst[i], '\0',
                sizeof (md_h_t));
        }
        medr.med_rec_meds.n_cnt = 0;
    } else {    /* Remove this host from the mediator node list. */
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            /* Copy non local node */
            if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
                (void) strcpy(medr.med_rec_nodes[i],
                    sd->sd_nodes[i]);
                continue;
            }

            /* Clear local node */
            (void) memset(&medr.med_rec_nodes[i], '\0',
                sizeof (md_node_nm_t));
        }
    }

    /* Checksum the record once its contents are final */
    crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

    /*
     * If the client is part of a cluster put the DCS service
     * into a deleting state.
     *
     * For an auto-take set a failure here is tolerated and merely
     * suppresses the matching sdssc_delete_end() call below.
     *
     * NOTE(review): on the other failure path ep is cleared and we
     * jump to out with rval still 0, so the caller sees success even
     * though the set was not deleted locally - confirm this is intended.
     */
    if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
        if (metad_isautotakebyname(sp->setname)) {
            delete_end = 0;
        } else {
            mdclrerror(ep);
            goto out;
        }
    }

    /*
     * Inform the mediator hosts of the new information.  The hosts are
     * taken from the set descriptor (sd->sd_med), not from the record
     * being sent, since the record may just have had them cleared.
     */
    for (i = 0; i < MED_MAX_HOSTS; i++) {
        if (sd->sd_med.n_lst[i].a_cnt == 0)
            continue;

        if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
            mdclrerror(ep);
    }

    /* Delete the set locally */
    for (i = 0; i < MD_MAXSIDES; i++) {
        /* Skip empty slots */
        if (sd->sd_nodes[i][0] == '\0')
            continue;

        /* Skip non local nodes */
        if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
            continue;

        if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
            mdclrerror(ep);
    }
    /* Commit the cluster-side deletion unless it was never begun */
    if (delete_end &&
        sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
        rval = -1;

out:
    /*
     * Cleanup errors are collected in xep and only handed to the
     * caller through ep when no earlier failure was recorded.
     */
    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0) {
        if (rval == 0)
            (void) mdstealerror(ep, &xep);
        rval = -1;
    }

    /*
     * NOTE(review): the unlock is attempted whenever lock_set is TRUE,
     * including when we got here before clnt_lock_set() was reached or
     * after it failed - confirm clnt_unlock_set() copes with that.
     */
    if (lock_set == TRUE) {
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
            if (rval == 0)
                (void) mdstealerror(ep, &xep);
            rval = -1;
        }
        cl_set_setkey(NULL);
    }

    metaflushsetname(sp);
    return (rval);
}

/*
 * meta_set_purge
 *  Forcibly remove a set from the node this is run on.  The set is
 *  locked on the hosts that know about it, the set is deleted on the
 *  local node (clnt_delset), the other hosts are asked to drop this
 *  host from the set (clnt_delhosts), and the locks are released.
 *  Both traditional and multi-node (MN) disksets are handled.
 *
 *  sp             - the set to purge
 *  bypass_cluster - when non-zero the sdssc_delete_begin() and
 *                   sdssc_delete_end() calls are skipped
 *  forceflg       - when non-zero, RPC failures while locking or
 *                   unlocking the set on a host are ignored
 *  ep             - error return
 *
 *  Return values:
 *      0   success
 *      1   the set description could not be obtained
 *      2   the set could not be locked on one of the hosts
 *      3   sdssc_delete_begin() failed
 *      4   sdssc_delete_end(SDSSC_COMMIT) failed
 *      5   the set could not be unlocked on one of the hosts
 *
 *  NOTE(review): when clnt_delset() fails the code jumps to the unlock
 *  path without changing rval, so 0 can also be returned in that case -
 *  confirm whether callers rely on this.
 */
int
meta_set_purge(
    mdsetname_t *sp,
    int     bypass_cluster,
    int     forceflg,
    md_error_t  *ep
)
{
    char        *thishost = mynode();
    md_set_desc *sd;
    md_setkey_t *cl_sk;
    md_error_t  xep = mdnullerror;
    int     rval = 0;
    int     i, num_hosts = 0;
    int     has_set = 0;
    int     max_node = 0;
    int     delete_end = 1;
    md_mnnode_desc  *nd;

    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        /* unable to find set description */
        rval = 1;
        return (rval);
    }

    if (MD_MNSET_DESC(sd)) {
        /*
         * Get a count of the hosts in the set and also lock the set
         * on those hosts that know about it.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /*
             * Only deal with those nodes that are members of
             * the set (MD_MN_NODE_ALIVE) or the node on which
             * the purge is being run. We must lock the set
             * on the purging node because the delset call
             * requires the lock to be set.
             */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
                nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
                nd = nd->nd_next;
                continue;
            }
            has_set = nodehasset(sp, nd->nd_nodename,
                NHS_NST_EQ, ep);

            /*
             * The host is not aware of this set (has_set < 0) or
             * the set does not match (has_set == 0). This check
             * prevents the code getting confused by an apparent
             * inconsistency in the set's state, this is in the
             * purge code so something is broken in any case and
             * this is just trying to fix the brokenness.
             */
            if (has_set <= 0) {
                mdclrerror(ep);
                nd->nd_flags |= MD_MN_NODE_NOSET;
            } else {
                num_hosts++;
                if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                    /*
                     * If the force flag is set then
                     * ignore any RPC failures because we
                     * are only really interested with
                     * the set on local node.
                     */
                    if (forceflg && mdanyrpcerror(ep)) {
                        mdclrerror(ep);
                    } else {
                        /*
                         * set max_node so that in the
                         * unlock code nodes in the
                         * set that have not been
                         * locked are not unlocked.
                         */
                        max_node = nd->nd_nodeid;
                        rval = 2;
                        goto out1;
                    }
                }

            }
            nd = nd->nd_next;
        }
        /*
         * Every lock attempt got through.  With max_node left at 0 the
         * unlock loop below only stops early if a node id equals 0;
         * presumably 0 is never a valid node id - TODO confirm.
         */
        max_node = 0;
    } else {
        /*
         * Get a count of the hosts in the set and also lock the set
         * on those hosts that know about it.
         */
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            has_set = nodehasset(sp, sd->sd_nodes[i],
                NHS_NST_EQ, ep);

            /*
             * The host is not aware of this set (has_set < 0) or
             * the set does not match (has_set == 0). This check
             * prevents the code getting confused by an apparent
             * inconsistency in the set's state, this is in the
             * purge code so something is broken in any case and
             * this is just trying to fix the brokenness.
             */
            if (has_set <= 0) {
                mdclrerror(ep);
                /*
                 * set the node to NULL to prevent further
                 * requests to this unresponsive node.
                 */
                sd->sd_nodes[i][0] = '\0';
            } else {
                num_hosts++;
                if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                    /*
                     * If the force flag is set then
                     * ignore any RPC failures because we
                     * are only really interested with
                     * the set on local node.
                     */
                    if (forceflg && mdanyrpcerror(ep)) {
                        mdclrerror(ep);
                    } else {
                        rval = 2;
                        /*
                         * set max_node so that in the
                         * unlock code nodes in the
                         * set that have not been
                         * locked are not unlocked.
                         */
                        max_node = i;
                        goto out1;
                    }
                }
            }
        }
        max_node = i;   /* now MD_MAXSIDES */
    }
    if (!bypass_cluster) {
        /*
         * If there is only one host associated with the
         * set then remove the set from the cluster.
         *
         * For an auto-take set a failure of sdssc_delete_begin()
         * is tolerated and only suppresses the commit further down.
         */
        if (num_hosts == 1) {
            if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
                if (metad_isautotakebyname(sp->setname)) {
                    delete_end = 0;
                } else {
                    mdclrerror(ep);
                    rval = 3;
                    goto out1;
                }
            }
        }
    }

    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
                /*
                 * This is the node on which the purge is
                 * being run. We do not care if it is
                 * alive or not, just want to get rid of
                 * the set.
                 *
                 * On failure the cluster-side deletion is
                 * rolled back (SDSSC_CLEANUP) and we bail
                 * out with rval unchanged.
                 */
                if (clnt_delset(nd->nd_nodename, sp,
                    ep) == -1) {
                    md_perror(dgettext(TEXT_DOMAIN,
                        "delset"));
                    if (!bypass_cluster && num_hosts == 1)
                        (void) sdssc_delete_end(
                            sp->setname, SDSSC_CLEANUP);
                    mdclrerror(ep);
                    goto out1;
                }
                nd = nd->nd_next;
                continue;
            }

            /*
             * Only contact those nodes that are members of
             * the set.
             */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /*
             * Tell the remote node to remove this node
             */
            if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
                ep) == -1) {
                /*
                 * If we fail to delete ourselves
                 * from the remote host it does not
                 * really matter because the set is
                 * being "purged" from this node. The
                 * set can be purged from the other
                 * node at a later time.
                 */
                mdclrerror(ep);
            }
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;
            if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
                /*
                 * Tell the remote node to remove this node
                 */
                if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
                    &thishost, ep) == -1) {
                    /*
                     * If we fail to delete ourselves
                     * from the remote host it does not
                     * really matter because the set is
                     * being "purged" from this node. The
                     * set can be purged from the other
                     * node at a later time.
                     */
                    mdclrerror(ep);
                }
                continue;
            }

            /*
             * remove the set from this host; on failure the
             * cluster-side deletion is rolled back (SDSSC_CLEANUP)
             * and we bail out with rval unchanged.
             */
            if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
                md_perror(dgettext(TEXT_DOMAIN, "delset"));
                if (!bypass_cluster && num_hosts == 1)
                    (void) sdssc_delete_end(sp->setname,
                        SDSSC_CLEANUP);
                mdclrerror(ep);
                goto out1;
            }
        }
    }

    /* Commit the cluster-side deletion started above, if any */
    if (!bypass_cluster && num_hosts == 1) {
        if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
            SDSSC_ERROR) {
            rval = 4;
        }
    }

out1:

    cl_sk = cl_get_setkey(sp->setno, sp->setname);

    /*
     * Remove the set lock on those nodes that had the set locked
     * max_node will either be MD_MAXSIDES or array index of the last
     * node contacted (or rather failed to contact) for traditional
     * diskset.  For a MN diskset, max_node is the node_id of the node
     * that failed the lock.
     *
     * Unlock errors are collected in xep and only handed to the caller
     * through ep when no earlier failure was recorded in rval.
     *
     * NOTE(review): the MN lock loop above also locks the purging node
     * when it is not marked alive and does not lock nodes it flagged
     * MD_MN_NODE_NOSET; the MN unlock loop below applies neither rule -
     * confirm that this asymmetry is intended.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }
            /* Stop at the node whose lock attempt failed */
            if (nd->nd_nodeid == max_node)
                break;
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
                if (forceflg && mdanyrpcerror(&xep)) {
                    mdclrerror(&xep);
                    nd = nd->nd_next;
                    continue;
                }
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = 5;
            }
            nd = nd->nd_next;
        }
    } else {
        for (i = 0; i < max_node; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
                if (forceflg && mdanyrpcerror(&xep)) {
                    mdclrerror(&xep);
                    continue;
                }
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = 5;
            }
        }
    }

    cl_set_setkey(NULL);

    return (rval);
}

/*
 * meta_set_query
 *  Collect the data tags of a set by issuing MD_MED_GET_TAG repeatedly,
 *  bumping the tag id after every successful call, and appending one
 *  list element per tag to the end of the list at *dtlpp.
 *
 *  The loop ends successfully (return 0) when the ioctl reports
 *  MDE_DB_NOTAG and the list slot dtlpp currently points at is not
 *  empty.  Any other ioctl failure is moved into ep and the result of
 *  mdstealerror() is returned.
 */
int
meta_set_query(
    mdsetname_t     *sp,
    mddb_dtag_lst_t     **dtlpp,
    md_error_t      *ep
)
{
    mddb_dtag_get_parm_t    parm;
    mddb_dtag_lst_t     *newtag;

    (void) memset(&parm, '\0', sizeof (parm));
    parm.dtgp_setno = sp->setno;

    for (;;) {
        if (metaioctl(MD_MED_GET_TAG, &parm, &parm.dtgp_mde,
            NULL) != 0) {
            /* "no more tags" is the normal way out of the loop */
            if (mdismddberror(&parm.dtgp_mde, MDE_DB_NOTAG) &&
                *dtlpp != NULL) {
                break;
            }
            return (mdstealerror(ep, &parm.dtgp_mde));
        }

        /* Advance to the empty link at the tail of the list */
        while (*dtlpp != NULL)
            dtlpp = &(*dtlpp)->dtl_nx;

        /* Build the new element, then hook it onto the tail */
        newtag = Zalloc(sizeof (*newtag));
        (void) memmove(&newtag->dtl_dt, &parm.dtgp_dt,
            sizeof (mddb_dtag_t));
        *dtlpp = newtag;

        /* Ask for the next tag on the following pass */
        parm.dtgp_dt.dt_id++;
    }

    return (0);
}

/*
 * metadrivename_withdrkey
 *  Return the drive name that belongs to a drive key.
 *
 *  The lookup is done by device id when one is recorded for the key,
 *  and by name otherwise.  In the by-name case, unless the device is a
 *  did device or the set is a MN diskset, an attempt is made to record
 *  a device id for the key and to repeat the lookup by device id.
 *
 *  sp     - set the drive belongs to
 *  sideno - side to look the key up on
 *  key    - drive key; stored in side_names_key of the result
 *  flags  - PRINT_FAST selects metaname_fast() for the by-name lookup,
 *           MD_BASICNAME_OK skips the metachkcomp() check and
 *           MD_BYPASS_DAEMON skips building the side name list
 *  ep     - error return
 *
 *  Returns the drive name, or NULL on failure.
 */
mddrivename_t *
metadrivename_withdrkey(
    mdsetname_t *sp,
    side_t      sideno,
    mdkey_t     key,
    int     flags,
    md_error_t  *ep
)
{
    char        *nm;
    mdname_t    *np;
    mddrivename_t   *dnp;
    ddi_devid_t devidp;
    md_set_desc *sd;

    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        return (NULL);
    }

    /*
     * Get the devid associated with the key.
     *
     * If a devid was returned, it MUST be valid even in
     * the case where a device id has been "updated". The
     * "update" of the device id may have occurred due to
     * a firmware upgrade.
     */
    if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
        != NULL) {
        /*
         * Look for the correct dnp using the devid for comparison.
         */
        dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
        free(devidp);

        /* dnp could be NULL if the devid could not be decoded. */
        if (dnp == NULL) {
            return (NULL);
        }
        dnp->side_names_key = key;
    } else {
        /*
         * We didn't get a devid. We'll try for a dnp using the
         * name. If we have a MN diskset or if the dnp is a did
         * device, we're done because then we don't have devids.
         * Otherwise we'll try to set the devid
         * and get the dnp via devid again.
         * We also need to clear the ep structure. When the
         * above call to meta_getdidbykey returned a null, it
         * also put an error code into ep. In this case, the null
         * return is actually OK and any errors can be ignored. The
         * reason it is OK is because this could be a MN set or
         * we could be running without devids (ex cluster).
         */
        mdclrerror(ep);

        if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
            ep)) == NULL)
            return (NULL);
        /* get device name */
        if (flags & PRINT_FAST) {
            if ((np = metaname_fast(&sp, nm,
                LOGICAL_DEVICE, ep)) == NULL) {
                Free(nm);
                return (NULL);
            }
        } else {
            if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
                ep)) == NULL) {
                Free(nm);
                return (NULL);
            }
        }
        /* the name string is not needed once np has been obtained */
        Free(nm);
        /* make sure it's OK */
        if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
            ep) != 0))
            return (NULL);

        /* get drivename */
        dnp = np->drivenamep;
        dnp->side_names_key = key;
        /*
         * Skip the devid set/check for the following cases:
         * 1) If MN diskset, there are no devid's
         * 2) if dnp is did device
         * The device id is disabled for did device due to the
         * lack of minor name support in the did driver. The following
         * devid code path can set and propagate the error and
         * eventually prevent did disks from being added to the
         * diskset under SunCluster systems
         *
         * Note that this code can be called through rpc.mdcommd.
         * sdssc_version cannot be used because the library won't
         * be bound.
         */
        if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
            == 0) || (MD_MNSET_DESC(sd)))
            goto out;

        /*
         * It is okay if replica is not in devid mode
         *
         * NOTE(review): ep was cleared above and every call made
         * since then had to succeed for us to get here, so it is
         * not obvious which call could have left MDDB_F_NODEVID
         * in ep - confirm whether this check can still trigger.
         */
        if (mdissyserror(ep, MDDB_F_NODEVID)) {
            mdclrerror(ep);
            goto out;
        }

        /*
         * We're not MN or did devices but
         * devid is missing so this means that we have
         * just upgraded from a configuration where
         * devid's were not used so try to add in
         * the devid and requery. If the devid still isn't there,
         * that's OK. dnp->devid will be null as it is in any
         * configuration with no devids.
         */
        if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
            return (NULL);
        if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
            sideno+SKEW, key, ep)) != NULL) {
            /*
             * Found a devid so look for the dnp using the
             * devid as the search mechanism.
             */
            dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
            free(devidp);
            if (dnp == NULL) {
                return (NULL);
            }
            dnp->side_names_key = key;
        }
    }


out:
    /* When bypassing the daemon the side name list is not built */
    if (flags & MD_BYPASS_DAEMON)
        return (dnp);

    if (get_sidenmlist(sp, dnp, ep))
        return (NULL);

    /* return success */
    return (dnp);
}

/*
 * metafreedrivedesc
 *  Free every element of the drive descriptor list at *dd and leave
 *  *dd set to NULL.  Only the list elements themselves are freed.
 */
void
metafreedrivedesc(md_drive_desc **dd)
{
    md_drive_desc   *cur = *dd;

    while (cur != NULL) {
        md_drive_desc   *victim = cur;

        /* pick up the link before the element goes away */
        cur = cur->dd_next;
        Free(victim);
    }

    *dd = NULL;
}

/*
 * metaget_drivedesc
 *  Return the drive descriptor list of a set for the side this host is
 *  on.  Returns NULL when the local side cannot be determined, or
 *  whatever metaget_drivedesc_sideno() returns otherwise.
 *  Must not be called with MD_BYPASS_DAEMON set in flags.
 */
md_drive_desc *
metaget_drivedesc(
    mdsetname_t *sp,
    int     flags,
    md_error_t  *ep
)
{
    side_t      myside;

    assert(! (flags & MD_BYPASS_DAEMON));

    myside = getmyside(sp, ep);
    if (myside != MD_SIDEWILD)
        return (metaget_drivedesc_sideno(sp, myside, flags, ep));

    return (NULL);
}

/*
 * metaget_drivedesc_fromnamelist
 *  Build a drive descriptor list with one MD_DR_ADD entry for the drive
 *  of every name in nlp, stamped with the creation time and generation
 *  id of the set.  Returns the list (NULL for an empty name list), or
 *  NULL when the set descriptor cannot be obtained.
 */
md_drive_desc *
metaget_drivedesc_fromnamelist(
    mdsetname_t *sp,
    mdnamelist_t    *nlp,
    md_error_t  *ep
)
{
    md_set_desc     *setdesc = metaget_setdesc(sp, ep);
    md_drive_desc       *head = NULL;

    if (setdesc == NULL)
        return (NULL);

    while (nlp != NULL) {
        (void) metadrivedesc_append(&head, nlp->namep->drivenamep,
            0, 0, setdesc->sd_ctime, setdesc->sd_genid, MD_DR_ADD);
        nlp = nlp->next;
    }

    return (head);
}

/*
 * metaget_drivedesc_sideno
 *  Return the drive descriptor list of a set for the given side.  The
 *  list is kept in the set descriptor (sd_drvs); it is only built, via
 *  dr2drivedesc(), when nothing is stored there yet.  Returns NULL when
 *  the set descriptor cannot be obtained or no list could be built.
 *  Must not be called with MD_BYPASS_DAEMON set in flags.
 */
md_drive_desc *
metaget_drivedesc_sideno(
    mdsetname_t *sp,
    side_t sideno,
    int flags,
    md_error_t *ep
)
{
    md_set_desc *setdesc;

    assert(! (flags & MD_BYPASS_DAEMON));

    setdesc = metaget_setdesc(sp, ep);
    if (setdesc == NULL)
        return (NULL);

    /* Nothing stored yet: build the list and remember the result */
    if (setdesc->sd_drvs == NULL)
        setdesc->sd_drvs = dr2drivedesc(sp, sideno, flags, ep);

    return (setdesc->sd_drvs);
}

/*
 * metaget_setownership
 *  Refresh the ownership information held in the set descriptor.
 *
 *  MN diskset: every node in the node list gets MD_MN_NODE_OWN set or
 *  cleared.  A node that is not alive, or that cannot be asked, is
 *  marked as not owning the set.  This path always returns 0.
 *
 *  Traditional diskset: sd_isown[] is cleared for all sides and then
 *  set for the local side when the local node owns the set.
 *
 *  Returns 0 on success.  Returns -1 when the set descriptor cannot be
 *  obtained, when the local ownership query fails, or when the local
 *  side cannot be determined (presumably getmyside() has reported the
 *  reason through ep in that case - TODO confirm).
 */
int
metaget_setownership(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc *sd;
    int     isowner;
    int     i;
    unsigned int    myside;
    md_mnnode_desc  *nd;

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    if (MD_MNSET_DESC(sd)) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            /* If node isn't alive, can't own diskset */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd->nd_flags &= ~MD_MN_NODE_OWN;
                continue;
            }
            /*
             * If can't communicate with rpc.metad, then mark
             * this node as not an owner.  That node may
             * in fact, be an owner, but without rpc.metad running
             * that node can't do much.
             */
            if (clnt_ownset(nd->nd_nodename, sp, &isowner,
                ep) != -1 && isowner == TRUE) {
                nd->nd_flags |= MD_MN_NODE_OWN;
            } else {
                nd->nd_flags &= ~MD_MN_NODE_OWN;
            }
        }
        return (0);
    }

    /* Rest of code handles traditional disksets */

    for (i = 0; i < MD_MAXSIDES; i++)
        sd->sd_isown[i] = 0;

    if (clnt_ownset(mynode(), sp, &isowner, ep) == -1)
        return (-1);

    if (isowner == TRUE) {
        /*
         * getmyside() can fail with the MD_SIDEWILD sentinel, as
         * its other callers in this file check for.  Never use the
         * result as an index into sd_isown[] unless it is a real
         * side number.
         */
        myside = (unsigned int)getmyside(sp, ep);
        if (myside >= (unsigned int)MD_MAXSIDES)
            return (-1);
        sd->sd_isown[myside] = 1;
    }

    return (0);
}

/*
 * Return the node name of this host as reported by uname().
 * The lookup is performed on the first call only; the result lives in
 * static storage and is returned directly on every later call.
 */
char *
mynode(void)
{
    static struct utsname   un;
    static int      cached = 0;

    if (cached)
        return (un.nodename);

    if (uname(&un) == -1) {
        md_perror(dgettext(TEXT_DOMAIN, "uname"));
        assert(0);
    }
    cached = 1;

    return (un.nodename);
}

/*
 * Report whether str occurs among the first cnt strings of lst.
 * Returns TRUE on an exact (strcmp) match, FALSE otherwise; a cnt of
 * zero or less never matches.
 */
int
strinlst(char *str, int cnt, char **lst)
{
    char    **entry;
    int remaining;

    for (entry = lst, remaining = cnt; remaining > 0;
        entry++, remaining--) {
        if (strcmp(*entry, str) == 0)
            return (TRUE);
    }

    return (FALSE);
}

/*
 * meta_get_reserved_names
 *  returns an mdnamelist_t of reserved slices
 *  reserved slices are those that are used but don't necessarily
 *  show up as metadevices (ex. reserved slice for db in sets, logs)
 */

/*
 * Append the reserved slices of diskset sp to the namelist *nlpp.
 *
 * For every drive in the set the replica slice is appended and counted;
 * afterwards the log device of every trans metadevice that has one is
 * appended as well (log devices are not added to the count).  Nothing is
 * done for the local set.
 *
 * Returns the number of replica slices appended, or -1 on failure.
 * Processing stops at the first failure so that the error already
 * recorded in ep is not disturbed by later lookups.
 */
/*ARGSUSED*/
int
meta_get_reserved_names(
    mdsetname_t *sp,
    mdnamelist_t    **nlpp,
    int     options,
    md_error_t  *ep)
{
    int      count      = 0;
    mdname_t    *np     = NULL;
    mdnamelist_t    *transnlp   = NULL;
    mdnamelist_t    **tailpp    = nlpp;
    mdnamelist_t    *nlp;
    md_drive_desc   *dd, *di;

    if (metaislocalset(sp))
        goto out;

    if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
        count = -1;
        goto out;
    }

    /* db in for sets on reserved slice */
    for (di = dd; di != NULL; di = di->dd_next) {
        uint_t  rep_slice;

        /*
         * Add the name struct to the end of the
         * namelist but keep a pointer to the last
         * element so that we don't incur the overhead
         * of traversing the list each time
         *
         * Any failure ends the whole operation right here;
         * continuing with the log lookup below would reuse a
         * possibly NULL tailpp and could overwrite the error
         * already stored in ep.
         */
        if (di->dd_dnp == NULL ||
            meta_replicaslice(di->dd_dnp, &rep_slice, ep) != 0 ||
            (np = metaslicename(di->dd_dnp, rep_slice, ep)) == NULL ||
            (tailpp = meta_namelist_append_wrapper(tailpp,
            np)) == NULL) {
            count = -1;
            goto out;
        }
        count++;
    }

    /* now find logs */
    if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
        count = -1;
        goto out;
    }

    for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
        mdname_t    *transnp = nlp->namep;
        md_trans_t  *transp;

        if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
            count = -1;
            goto out;
        }
        if (transp->lognamep) {
            /*
             * Add the name struct to the end of the
             * namelist but keep a pointer to the last
             * element so that we don't incur the overhead
             * of traversing the list each time
             */
            tailpp = meta_namelist_append_wrapper(
                tailpp, transp->lognamep);
        }
    }
out:
    /* transnlp is still NULL when we bail out before the log lookup */
    metafreenamelist(transnlp);
    return (count);
}

/*
 * Entry point to join a node to MultiNode diskset.
 *
 * Validate host in diskset.
 *  - Should be in membership list from API
 *  - Should not already be joined into diskset.
 *  - Set must have drives
 * Assume valid configuration is stored in the set/drive/node records
 * in the local mddb since no node or drive can be added to the MNset
 * unless all drives and nodes are available.  Reconfig steps will
 * resync all ALIVE nodes in case of panic in critical areas.
 *
 * Lock down the set.
 * Verify host is a member of this diskset.
 * If drives exist in the configuration, load the mddbs.
 * Set this node to active by notifying master if one exists.
 * If this is the first node active in the diskset, this node
 *  becomes the master.
 * Unlock the set.
 *
 * Mirror Resync:
 * If this node is the last node to join the set and clustering
 * isn't running, then start the 'metasync -r' type resync
 * on all mirrors in this diskset.
 * If clustering is running, this resync operation will
 * be handled by the reconfig steps and should NOT
 * be handled during a join operation.
 *
 * There are multiple return values in order to assist
 * the join operation of all sets in the metaset command.
 *
 * Return values:
 *  0  - Node successfully joined to set.
 *  -1 - Join attempted but failed
 *      - any failure from libmeta calls
 *      - node not in the member list
 *  -2 - Join not attempted since
 *      - this set had no drives in set
 *      - this node already joined to set
 *      - set is not a multinode set
 *  -3 - Node joined to STALE set.
 */
extern int
meta_set_join(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc     *sd;
    md_drive_desc       *dd;
    md_mnnode_desc      *nd, *nd2, my_nd;
    int         rval = 0;
    md_setkey_t     *cl_sk;
    md_error_t      xep = mdnullerror;
    md_error_t      ep_snarf = mdnullerror;
    int         master_flag = 0;
    md_mnset_record     *mas_mnsr = NULL;
    int         clear_nr_flags = 0;
    md_mnnode_record    *nr;
    int         stale_set = 0;
    int         rb_flags = 0;
    int         stale_bool = FALSE;
    int         suspendall_flag = 0;
    int         suspend1_flag = 0;
    sigset_t        oldsigs;
    int         send_reinit = 0;

    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        return (-1);
    }

    /* Must be a multinode diskset */
    if (!MD_MNSET_DESC(sd)) {
        (void) mderror(ep, MDE_NOT_MN, sp->setname);
        return (-2);
    }

    /* Verify that the node is ALIVE (i.e. is in the API membership list) */
    if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
        (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
            sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
        return (-1);
    }

    /* Make sure we are blocking all signals */
    if (procsigs(TRUE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    /*
     * Lock the set on current set members.
     * For MN diskset lock_set and SUSPEND are used to protect against
     * other meta* commands running on the other nodes.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
            rval = -1;
            goto out;
        }
        nd = nd->nd_next;
    }

    /*
     * Lock out other meta* commands by suspending
     * class 1 messages across the diskset.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
            sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
            rval = -1;
            goto out;
        }
        suspend1_flag = 1;
        nd = nd->nd_next;
    }

    /*
     * Verify that this host is a member (in the host list) of the set.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (strcmp(mynode(), nd->nd_nodename) == 0) {
            break;
        }
        nd = nd->nd_next;
    }
    if (!nd) {
        (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
            sd->sd_mn_mynode->nd_nodename, NULL,
            sp->setname);
        rval = -1;
        goto out;
    }

    /*
     * Need to return failure if host is already 'joined'
     * into the set.  This is done so that if later the user
     * issues a command to join all sets and a failure is
     * encountered - that the resulting cleanup effort
     * (withdrawing from all sets that were joined
     * during that command) won't withdraw from this set.
     *
     * Jump to out2 rather than out so that none of the rollback
     * steps are run against a set this node legitimately owns.
     */
    if (nd->nd_flags & MD_MN_NODE_OWN) {
        rval = -2;
        goto out2;
    }

    /*
     * Call metaget_setownership that calls each node in diskset and
     * marks in set descriptor if node is an owner of the set or not.
     * metaget_setownership checks to see if a node is an owner by
     * checking to see if that node's kernel has the mddb loaded.
     * If a node had panic'd during a reconfig or an
     * add/delete/join/withdraw operation, the other nodes' node
     * records may not reflect the current state of the diskset,
     * so calling metaget_setownership is the safest thing to do.
     */
    if (metaget_setownership(sp, ep) == -1) {
        rval = -1;
        goto out;
    }

    /* If first active member of diskset, become the master. */
    nd = sd->sd_nodelist;
    while (nd) {
        if (nd->nd_flags & MD_MN_NODE_OWN)
            break;
        nd = nd->nd_next;
    }
    if (nd == NULL)
        master_flag = 1;

    /*
     * If not first active member of diskset, then get the
     * master information from a node that is already joined
     * and set the master information for this node.  Be sure
     * that this node (the already joined node) has its own
     * join flag set.  If not, then this diskset isn't currently
     * consistent and shouldn't allow a node to join.  This diskset
     * inconsistency should only occur when a node has panic'd in
     * the set while doing a metaset operation and the sysadmin is
     * attempting to join a node into the set.  This inconsistency
     * will be fixed during a reconfig cycle which should be occurring
     * soon since a node panic'd.
     *
     * If unable to get this information from an owning node, then
     * this diskset isn't currently consistent and shouldn't
     * allow a node to join.
     */
    if (!master_flag) {
        /* get master information from an owner (joined) node */
        if (clnt_mngetset(nd->nd_nodename, sp->setname,
            sp->setno, &mas_mnsr, ep) == -1) {
            rval = -1;
            goto out;
        }

        /* Verify that owner (joined) node has its own JOIN flag set */
        nr = mas_mnsr->sr_nodechain;
        while (nr) {
            if ((nd->nd_nodeid == nr->nr_nodeid) &&
                ((nr->nr_flags & MD_MN_NODE_OWN) == 0)) {
                (void) mddserror(ep, MDE_DS_NODENOSET,
                    sp->setno, nd->nd_nodename, NULL,
                    nd->nd_nodename);
                free_sr((md_set_record *)mas_mnsr);
                rval = -1;
                goto out;
            }
            nr = nr->nr_next;
        }

        /*
         * Does master have set marked as STALE?
         * If so, need to pass this down to kernel when
         * this node snarfs the set.
         */
        if (clnt_mn_is_stale(nd->nd_nodename, sp,
            &stale_bool, ep) == -1) {
            /*
             * The set record fetched above must be released
             * on this error path too, just as on the others.
             */
            free_sr((md_set_record *)mas_mnsr);
            rval = -1;
            goto out;
        }

        /* set master information in my rpc.metad's set record */
        if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
            mas_mnsr->sr_master_nodeid, ep)) {
            free_sr((md_set_record *)mas_mnsr);
            rval = -1;
            goto out;
        }

        /* set master information in my cached set desc */
        (void) strcpy(sd->sd_mn_master_nodenm,
            mas_mnsr->sr_master_nodenm);
        sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
        nd2 = sd->sd_nodelist;
        while (nd2) {
            if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
                sd->sd_mn_masternode = nd2;
                break;
            }
            nd2 = nd2->nd_next;
        }
        free_sr((md_set_record *)mas_mnsr);

        /*
         * Set the node flags in mynode's rpc.metad node records for
         * the nodes that are in the diskset.  Can use my sd
         * since earlier call to metaget_setownership set the
         * owner flags based on whether that node had snarfed
         * the MN diskset mddb.  Reconfig steps guarantee that
         * return of metaget_setownership will match the owning
         * node's owner list except in the case where a node
         * has just panic'd and in this case, a reconfig will
         * be starting immediately and the owner lists will
         * be sync'd up by the reconfig.
         *
         * Flag of SET means to take no action except to
         * set the node flags as given in the nodelist linked list.
         */
        if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
            MD_NR_SET, NULL, ep)) {
            rval = -1;
            goto out;
        }
    }

    /*
     * Read in the mddb if there are drives in the set.
     */
    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep)) == NULL) {
        /* No drives in list */
        if (! mdisok(ep)) {
            rval = -1;
            goto out;
        }
        rval = -2;
        goto out;
    }

    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Start by suspending rpc.mdcommd (which drains it of all messages),
     * then change the nodelist followed by a reinit and resume.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }

        if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
            MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
            rval = -1;
            goto out;
        }
        suspendall_flag = 1;
        nd = nd->nd_next;
    }

    /* Set master in my set record in rpc.metad */
    if (master_flag) {
        if (clnt_mnsetmaster(mynode(), sp,
            sd->sd_mn_mynode->nd_nodename,
            sd->sd_mn_mynode->nd_nodeid, ep)) {
            rval = -1;
            goto out;
        }
    }
    /*
     * Causes mddbs to be loaded into the kernel.
     * Set the force flag so that replica locations can be
     * loaded into the kernel even if a mediator node was
     * unavailable.  This allows a node to join an MO
     * diskset when there are sufficient replicas available,
     * but a mediator node in unavailable.
     */
    if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
        mde_perror(ep, dgettext(TEXT_DOMAIN,
            "Host not able to start diskset."));
        rval = -1;
        goto out;
    }

    if (! mdisok(ep)) {
        rval = -1;
        goto out;
    }

    /*
     * Set rollback flags to 1 so that halt_set is called if a failure
     * is seen after this point.  If snarf_set fails, still need to
     * call halt_set to cleanup the diskset.
     */
    rb_flags = 1;

    /* Starts the set */
    if (snarf_set(sp, stale_bool, ep) != 0) {
        if (mdismddberror(ep, MDE_DB_STALE)) {
            /*
             * Don't fail join, STALE means that set has
             * < 50% mddbs.
             */
            (void) mdstealerror(&ep_snarf, ep);
            stale_set = 1;
        } else if (mdisok(ep)) {
            /* If snarf failed, but no error was set - set it */
            (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
                sp->setno, 0, NULL);
            rval = -1;
            goto out;
        } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
            /*
             * Don't fail join if ACCOK; ACCOK means that mediator
             * provided extra vote.
             */
            rval = -1;
            goto out;
        }
    }

    /* Did set really get snarfed? */
    if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
        if (mdisok(ep)) {
            /* If snarf failed, but no error was set - set it */
            (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
                sp->setno, 0, NULL);
        }
        mde_perror(ep, dgettext(TEXT_DOMAIN,
            "Host not able to start diskset."));
        rval = -1;
        goto out;
    }

    /* Change to nodelist so need to send reinit to rpc.mdcommd */
    send_reinit = 1;

    /* If first node to enter set, setup master and clear change log */
    if (master_flag) {
        /* Set master in my locally cached set descriptor */
        (void) strcpy(sd->sd_mn_master_nodenm,
            sd->sd_mn_mynode->nd_nodename);
        sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
        sd->sd_mn_am_i_master = 1;

        /*
         * If first node to join set, then clear out change log
         * entries.  Change log entries are only needed when a
         * change of master is occurring in a diskset that has
         * multiple owners.   Since this node is the first owner
         * of the diskset, clear the entries.
         *
         * Only do this if we are in a single node non-SC3.x
         * situation.
         */
        if (meta_mn_singlenode() &&
            mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to reset changelog."));
            rval = -1;
            goto out;
        }
    }

    /* Set my locally cached flag */
    sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;

    /*
     * Set this node's own flag on all joined nodes in the set
     * (including my node).
     */
    clear_nr_flags = 1;

    my_nd = *(sd->sd_mn_mynode);    /* structure copy */
    my_nd.nd_next = NULL;
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
            MD_NR_JOIN, NULL, ep)) {
            rval = -1;
            goto out;
        }
        nd = nd->nd_next;
    }

out:
    if (rval != 0) {
        /*
         * If rollback flag is 1, then node was joined to set.
         * Since an error occurred, withdraw node from set in
         * order to rollback to before command was run.
         * Need to preserve ep so that calling function can
         * get error information.
         */
        if (rb_flags == 1) {
            if (halt_set(sp, &xep)) {
                mdclrerror(&xep);
            }
        }

        /*
         * If error, reset master to INVALID.
         * Ignore error since (next) first node to successfully join
         * will set master on all nodes.
         */
        (void) clnt_mnsetmaster(mynode(), sp, "",
            MD_MN_INVALID_NID, &xep);
        mdclrerror(&xep);
        /* Reset master in my locally cached set descriptor */
        sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
        sd->sd_mn_am_i_master = 0;

        /*
         * If nr flags set on other nodes, reset them.
         * my_nd is valid here: it is filled in right after
         * clear_nr_flags is set.
         */
        if (clear_nr_flags) {
            nd = sd->sd_nodelist;
            while (nd) {
                if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                    nd = nd->nd_next;
                    continue;
                }
                (void) clnt_upd_nr_flags(nd->nd_nodename, sp,
                    &my_nd, MD_NR_WITHDRAW, NULL, &xep);
                mdclrerror(&xep);
                nd = nd->nd_next;
            }
            /* Reset my locally cached flag */
            sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
        }
    }

    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Send reinit command to mdcommd which forces it to get
     * fresh set description.
     */
    if (send_reinit) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, 0, MD_MSCF_NO_FLAGS, &xep)) {
                /*
                 * We are here because we failed to reinit
                 * rpc.mdcommd.  However we potentially have
                 * an error from the previous call.
                 * If the previous call did not fail, we
                 * capture this error; either way a perror
                 * is generated with the string
                 * "Unable to reinit...".
                 * Setting rval to -1 ensures that in the
                 * next iteration of the loop, ep is not
                 * clobbered.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd."));
            }
            nd = nd->nd_next;
        }

    }

out2:
    /*
     * Unlock diskset by resuming messages across the diskset.
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if ((suspend1_flag) || (suspendall_flag)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                /*
                 * We are here because we failed to resume
                 * rpc.mdcommd.  However we potentially have
                 * an error from the previous call.
                 * If the previous call did not fail, we
                 * capture this error; either way a perror
                 * is generated with the string
                 * "Unable to resume...".
                 * Setting rval to -1 ensures that in the
                 * next iteration of the loop, ep is not
                 * clobbered.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd."));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    /*
     * Unlock set.  This flushes the caches on the servers.
     */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
            if (rval == 0)
                (void) mdstealerror(ep, &xep);
            else
                mdclrerror(&xep);
            rval = -1;
        }
        nd = nd->nd_next;
    }

    /*
     * If this node is the last to join the diskset and clustering isn't
     * running, then resync the mirrors in the diskset. We have to wait
     * until all nodes are joined so that the status gets propagated to
     * all of the members of the set.
     * Ignore any error from the resync as the join function shouldn't fail
     * because the mirror resync had a problem.
     *
     * Don't start resync if set is stale.
     */
    if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
        (stale_set != 1)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_OWN))
                break;
            nd = nd->nd_next;
        }
        /*
         * nd set to NULL means that we have no nodes in the set that
         * haven't joined. In this case we start the resync.
         */
        if (nd == NULL) {
            (void) meta_mirror_resync_all(sp, 0, &xep);
            mdclrerror(&xep);
        }
    }

    /* Update ABR state for all soft partitions */
    (void) meta_sp_update_abr(sp, &xep);
    mdclrerror(&xep);

    /*
     * call metaflushsetnames to reset local cache for master and
     * node information.
     */
    metaflushsetname(sp);

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    /*
     * If no error and stale_set is set, then set ep back
     * to ep from snarf_set call and return -3.  If another error
     * occurred and rval is not 0, then that error would have
     * caused the node to be withdrawn from the set and would
     * have set ep to that error information.
     */
    if ((rval == 0) && (stale_set)) {
        (void) mdstealerror(ep, &ep_snarf);
        return (-3);
    }

    return (rval);
}

/*
 * Entry point to withdraw a node from MultiNode diskset.
 *
 * Validate host in diskset.
 *  - Should be joined into diskset.
 * Assume valid configuration is stored in the set/drive/node records
 * in the local mddb since no node or drive can be added to the MNset
 * unless all drives and nodes are available.  Reconfig steps will
 * resync all ALIVE nodes in case of panic in critical areas.
 *
 * Lock down the set.
 * Verify that drives exist in configuration.
 * Verify host is a member of this diskset.
 * Verify host is an owner of the diskset (host is joined to diskset).
 * Only allow withdrawal of master node if master node is the only joined
 * node in the diskset.
 * Halt the diskset on this node.
 * Reset Master on this node.
 * Update node flags to show that this node has withdrawn.
 * Unlock the set.
 *
 * Return values:
 *  0  - Node successfully withdrew from set.
 *  -1 - Withdrawal attempted but failed
 *      - any failure from libmeta calls
 *      - node not in the member list
 *  -2 - Withdrawal not attempted since
 *      - this set had no drives in set
 *      - this node not joined to set
 *      - set is not a multinode set
 */
extern int
meta_set_withdraw(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc     *sd;
    md_drive_desc       *dd = 0;
    md_mnnode_desc      *nd, my_nd;
    int         rval = 0;
    md_setkey_t     *cl_sk;
    md_error_t      xep = mdnullerror;
    int         set_halted = 0;
    int         suspendall_flag = 0;
    int         suspend1_flag = 0;
    bool_t          stale_bool = FALSE;
    mddb_config_t       c;
    int         node_id_list[1];
    sigset_t        oldsigs;
    int         send_reinit = 0;

    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        return (-1);
    }

    /* Must be a multinode diskset */
    if (!MD_MNSET_DESC(sd)) {
        (void) mderror(ep, MDE_NOT_MN, sp->setname);
        return (-1);
    }

    /* Make sure we are blocking all signals */
    if (procsigs(TRUE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    /*
     * Lock the set on current set members.
     * For MN diskset lock_set and SUSPEND are used to protect against
     * other meta* commands running on the other nodes.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
            rval = -1;
            goto out;
        }
        nd = nd->nd_next;
    }
    /*
     * Lock out other meta* commands by suspending
     * class 1 messages across the diskset.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
            sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
            rval = -1;
            goto out;
        }
        suspend1_flag = 1;
        nd = nd->nd_next;
    }

    /* Get list of drives - needed in case of failure */
    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep)) == NULL) {
        /* Error getting drives in list */
        if (! mdisok(ep)) {
            rval = -1;
            goto out2;
        }
        /* no drives in list */
        rval = -2;
        goto out2;
    }

    /*
     * Verify that this host is a member (in the host list) of the set.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (strcmp(mynode(), nd->nd_nodename) == 0) {
            break;
        }
        nd = nd->nd_next;
    }
    if (!nd) {
        (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
            sd->sd_mn_mynode->nd_nodename, NULL,
            sp->setname);
        rval = -1;
        goto out2;
    }

    /*
     * Call metaget_setownership that calls each node in diskset and
     * marks in set descriptor if node is an owner of the set or not.
     * metaget_setownership checks to see if a node is an owner by
     * checking to see if that node's kernel has the mddb loaded.
     * If a node had panic'd during a reconfig or an
     * add/delete/join/withdraw operation, the other nodes' node
     * records may not reflect the current state of the diskset,
     * so calling metaget_setownership is the safest thing to do.
     */
    if (metaget_setownership(sp, ep) == -1) {
        rval = -1;
        goto out2;
    }

    /*
     * Verify that this node is joined
     * to diskset (i.e. is an owner of the diskset).
     */
    if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
        rval = -2;
        goto out2;
    }

    /*
     * For a MN diskset, only withdraw master if it is
     * the only joined node.
     */
    if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
        nd = sd->sd_nodelist;
        while (nd) {
            /* Skip my node since checking for other owners */
            if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
                nd = nd->nd_next;
                continue;
            }
            /* If another owner node if found, error */
            if (nd->nd_flags & MD_MN_NODE_OWN) {
                (void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
                    sp->setno,
                    sd->sd_mn_mynode->nd_nodename, NULL,
                    sp->setname);
                rval = -1;
                goto out2;
            }
            nd = nd->nd_next;
        }
    }

    /*
     * Is current set STALE?
     */
    (void) memset(&c, 0, sizeof (c));
    c.c_id = 0;
    c.c_setno = sp->setno;
    if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
        (void) mdstealerror(ep, &c.c_mde);
        rval = -1;
        goto out;
    }
    if (c.c_flags & MDDB_C_STALE) {
        stale_bool = TRUE;
    }

    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Start by suspending rpc.mdcommd (which drains it of all messages),
     * then change the nodelist followed by a reinit and resume.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }

        if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
            sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
            rval = -1;
            goto out;
        }
        suspendall_flag = 1;
        nd = nd->nd_next;
    }

    /*
     * Withdraw the set - halt set.
     * This will fail if any I/O is occuring to any metadevice which
     * includes a resync to a mirror metadevice.
     */
    set_halted = 1;
    if (halt_set(sp, ep)) {
        /* Was set actually halted? */
        if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
            set_halted = 0;
        }
        rval = -1;
        goto out;
    }

    /* Change to nodelist so need to send reinit to rpc.mdcommd */
    send_reinit = 1;

    /* Reset master on withdrawn node */
    if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
        MD_MN_INVALID_NID, ep)) {
        rval = -1;
        goto out;
    }

    /* Mark my node as withdrawn and send to other nodes */
    nd = sd->sd_nodelist;
    my_nd = *(sd->sd_mn_mynode);    /* structure copy */
    my_nd.nd_next = NULL;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
            MD_NR_WITHDRAW, NULL, ep)) {
            rval = -1;
            goto out;
        }
        nd = nd->nd_next;
    }

    /*
     * If withdrawn node is a mirror owner, reset mirror owner
     * to NULL.  If an error occurs, print a warning and continue.
     * Don't fail metaset because of mirror owner reset problem since
     * next node to grab mirror will resolve this issue.
     * Before next node grabs mirrors, metaset will show the withdrawn
     * node as owner which is why an attempt to reset the mirror owner
     * is made.
     */
    node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;  /* Setup my nodeid */
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
            1, &node_id_list[0], &xep) == 01) {
            mde_perror(&xep, dgettext(TEXT_DOMAIN,
                "Unable to reset mirror owner on node %s"),
                nd->nd_nodename);
            mdclrerror(&xep);
        }
        nd = nd->nd_next;
    }

out:
    if (rval == -1) {
        /* Rejoin node - Mark node as joined and send to other nodes */
        nd = sd->sd_nodelist;
        my_nd = *(sd->sd_mn_mynode);    /* structure copy */
        my_nd.nd_next = NULL;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                nd = nd->nd_next;
                continue;
            }
            if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
                MD_NR_JOIN, NULL, &xep)) {
                mdclrerror(&xep);
            }
            nd = nd->nd_next;
        }

        /* Set master on withdrawn node */
        if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
            sd->sd_mn_master_nodenm,
            sd->sd_mn_master_nodeid, &xep)) {
            mdclrerror(&xep);
        }

        /* Join set if halt_set had succeeded */
        if (set_halted) {
            /*
             * Causes mddbs to be loaded into the kernel.
             * Set the force flag so that replica locations can be
             * loaded into the kernel even if a mediator node was
             * unavailable.  This allows a node to join an MO
             * diskset when there are sufficient replicas available,
             * but a mediator node is unavailable.
             */
            if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
                mdclrerror(&xep);
            }
            /* If set previously stale - make it so at re-join */
            if (snarf_set(sp, stale_bool, &xep) != 0) {
                mdclrerror(&xep);
                (void) halt_set(sp, &xep);
                mdclrerror(&xep);
            }
        }
    }

    /*
     * Notify rpc.mdcommd on all nodes of a nodelist change.
     * Send reinit command to mdcommd which forces it to get
     * fresh set description.
     */
    if (send_reinit) {
        /* Send reinit */
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Class is ignored for REINIT */
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
                sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
                /*
                 * We are here because we failed to reinit
                 * rpc.mdcommd.  However we potentially have
                 * an error from the previous call.
                 * If the previous call did fail, we
                 * capture that error and generate a perror
                 * with the string, "Unable to reinit...".
                 * Setting rval to -1 ensures that in the
                 * next iteration of the loop, ep is not
                 * clobbered.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to reinit rpc.mdcommd."));
            }
            nd = nd->nd_next;
        }
    }

out2:
    /*
     * Unlock diskset by resuming messages across the diskset.
     * Just resume all classes so that resume is the same whether
     * just one class was locked or all classes were locked.
     */
    if ((suspend1_flag) || (suspendall_flag)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                /*
                 * We are here because we failed to resume
                 * rpc.mdcommd.  However we potentially have
                 * an error from the previous call
                 * If the previous call did fail,  we capture
                 * that error and generate a perror with
                 * the string, "Unable to resume...".
                 * Setting rval to -1 ensures that in the
                 * next iteration of the loop, ep is not
                 * clobbered.
                 */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                else
                    mdclrerror(&xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd."));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    /*
     * Unlock set.  This flushes the caches on the servers.
     */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    nd = sd->sd_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
            if (rval == 0)
                (void) mdstealerror(ep, &xep);
            else
                mdclrerror(&xep);
            rval = -1;
        }
        nd = nd->nd_next;
    }

    /*
     * call metaflushsetnames to reset local cache for master and
     * node information.
     */
    metaflushsetname(sp);

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    return (rval);

}

/*
 * Update nodelist with cluster member information.
 * A node not in the member list will be marked
 * as not ALIVE and not OWN.
 * A node in the member list will be marked ALIVE, but
 * the OWN bit will not be changed.
 * An empty (NULL) member list means that no node is a member, so
 * every node in the nodelist is marked as not ALIVE and not OWN.
 *
 * If mynode isn't in the membership list, fail causing
 * another reconfig cycle to be started since a non-member
 * node shouldn't be taking part in the reconfig cycle.
 * If mynode was still flagged OWN in that case, the set is halted
 * on this node first.
 *
 * Once every node has been processed, the resulting node flags are
 * sent to rpc.metad on this node.
 *
 * Parameters:
 *  sp - set being reconfigured (setno and setname are used).
 *  nl - cluster membership list, chained through ->next and
 *       matched against the nodelist by msl_node_id.  May be NULL.
 *  sd - set descriptor; the flags of each entry in sd_nodelist
 *       are updated in place.
 *  ep - receives MDE_DS_NOTINMEMBERLIST when mynode is not a
 *       member, otherwise the error from the rpc.metad update.
 *
 * Return values:
 *  0 - No problem.
 *  1 - Any failure including RPC failure to my node.
 */
int
meta_reconfig_update_nodelist(
    mdsetname_t         *sp,
    mndiskset_membershiplist_t  *nl,
    md_set_desc         *sd,
    md_error_t          *ep
)
{
    mndiskset_membershiplist_t  *nl2;
    md_mnnode_desc          *nd;
    md_error_t          xep = mdnullerror;
    int             rval = 0;

    /*
     * Walk through nodelist, checking to see if each
     * node is in the member list.
     * If node is not a member, reset ALIVE and OWN node flag.
     * If node is a member, set ALIVE.
     * If mynode's OWN flag gets reset, then halt the diskset on this node.
     */
    for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
        /* Search the member list for this node's id */
        for (nl2 = nl; nl2 != NULL; nl2 = nl2->next) {
            if (nl2->msl_node_id == nd->nd_nodeid)
                break;
        }

        /* If node is in member list, set ALIVE; OWN is left alone */
        if (nl2 != NULL) {
            nd->nd_flags |= MD_MN_NODE_ALIVE;
            continue;
        }

        /*
         * Node is not in member list (this includes the case of an
         * empty member list), mark !ALIVE and !OWN.
         */

        /* If node is mynode, then halt set if needed */
        if (strcmp(mynode(), nd->nd_nodename) == 0) {
            /*
             * This shouldn't happen, but just
             * in case...  Any node not in the
             * membership list should be dead and
             * not running reconfig step1.
             */
            if (nd->nd_flags & MD_MN_NODE_OWN) {
                if (halt_set(sp, &xep)) {
                    /* Report the halt failure and carry on */
                    mde_perror(&xep, "");
                    mdclrerror(&xep);
                }
            }
            /*
             * Return failure since this node
             * (mynode) is not in the membership
             * list, but process the rest of the
             * nodelist first so that rpc.metad
             * can be updated with the latest
             * membership information.
             */
            (void) mddserror(ep,
                MDE_DS_NOTINMEMBERLIST,
                sp->setno, nd->nd_nodename, NULL,
                sp->setname);
            rval = 1;
        }
        nd->nd_flags &= ~(MD_MN_NODE_ALIVE | MD_MN_NODE_OWN);
    }

    /* Send this information to rpc.metad */
    if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
        MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
        /*
         * Return failure if can't send node flags to rpc.metad.
         * Only the first error is reported through ep; a later
         * error is cleared here so that xep is not left populated.
         */
        if (rval == 0) {
            (void) mdstealerror(ep, &xep);
            rval = 1;
        } else {
            mdclrerror(&xep);
        }
    }
    return (rval);
}

/*
 * Choose master determines the master for a diskset.
 * Each node determines the master on its own and
 * adds this information to its local rpc.metad nodelist
 * and also sends it to the kernel.
 *
 * Nodelist in set descriptor (sd) is sorted in
 * monotonically increasing sequence of nodeid.
 *
 * Return values:
 *  0 - No problem.
 *  205 - There was an RPC problem to another node.
 *  -1 - There was an error.  This could be an RPC error to my node.
 *      This is a catastrophic failure causing node to panic.
 */
int
meta_reconfig_choose_master_for_set(
    mdsetname_t *sp,
    md_set_desc *sd,
    md_error_t  *ep
)
{
    int         is_owner;
    md_mnset_record     *mnsr = NULL;
    int         lowest_alive_nodeid = 0;
    uint_t          master_nodeid;
    md_mnnode_desc      *nd, *nd2;
    md_mnnode_record    *nr;
    md_drive_desc       *dd;
    md_setkey_t     *cl_sk;
    int         rval = 0;
    md_error_t      xep = mdnullerror;
    mddb_setflags_config_t  sf;

    /*
     * Is current node joined to diskset?
     * Don't trust flags, really check to see if mddb is snarfed.
     */
    if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
        /*
         * If a node is joined to the diskset, this node checks
         * to see if the current master of the diskset is valid and
         * is still in the membership list (ALIVE) and is
         * still joined (OWN).  Need to verify if master is
         * really joined - don't trust the flags.  (Can trust
         * ALIVE since set during earlier part of reconfig cycle.)
         * If the current master is valid, still in the membership
         * list and joined, then master is not changed on this node.
         * Just return.
         *
         * Verify that nodeid is valid before accessing masternode.
         */
        if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
            (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
            if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
                &is_owner, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    sd->sd_mn_master_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            } else {
                if (is_owner == TRUE) {

                    meta_mc_log(MC_LOG5, dgettext(
                        TEXT_DOMAIN, "Set %s previous "
                        "master chosen %s (%d): %s"),
                        sp->setname,
                        sd->sd_mn_master_nodenm,
                        sd->sd_mn_master_nodeid,
                        meta_print_hrtime(gethrtime() -
                        start_time));

                    /* Previous master is ok - done */
                    return (0);
                }
            }
        }

        /*
         * If current master is no longer in the membership list or
         * is no longer joined, then this node uses the following
         * algorithm:
         * - node calls RPC routine clnt_ownset to get latest
         *  information on which nodes are owners of diskset.
         *  clnt_ownset checks on each node to see if its kernel
         *  has that diskset snarfed.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            if (clnt_ownset(nd->nd_nodename, sp,
                &is_owner, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }

            /*
             * Set owner flag for each node based on whether
             * that node really has a diskset mddb snarfed in
             * or not.
             */
            if (is_owner == TRUE)
                nd->nd_flags |= MD_MN_NODE_OWN;
            else
                nd->nd_flags &= ~MD_MN_NODE_OWN;

            nd = nd->nd_next;
        }

        /*
         * - node walks through nodelist looking for nodes that are
         *  owners of the diskset that are in the membership list.
         * - for each owner, node calls RPC routine clnt_getset to
         *   see if that node has its node record set to OK.
         * - If so, master is chosen to be this owner node.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Don't consider a node that isn't an owner */
            if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                nd = nd->nd_next;
                continue;
            }

            /* Does node have its own node record set to OK? */
            if (clnt_mngetset(nd->nd_nodename, sp->setname,
                MD_SET_BAD, &mnsr, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (nd->nd_nodeid == nr->nr_nodeid) {
                    if (nr->nr_flags & MD_MN_NODE_OK) {
                        /* Found a master */
                        free_sr(
                            (md_set_record *)mnsr);
                        goto found_master;
                    }
                }
                nr = nr->nr_next;
            }
            free_sr((md_set_record *)mnsr);
            nd = nd->nd_next;
        }

        /*
         * - If no owner node has its own node record on its own node
         *  set to OK, then this node checks all of the non-owner
         *  nodes that are in the membership list.
         * - for each non-owner, node calls RPC routine clnt_getset to
         *   see if that node has its node record set to OK.
         * - If set doesn't exist, don't choose node for master.
         * - If so, master is chosen to be this non-owner node.
         *
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Only checking non-owner nodes this time around */
            if (nd->nd_flags & MD_MN_NODE_OWN) {
                nd = nd->nd_next;
                continue;
            }

            /* Does node have its own node record set to OK? */
            if (clnt_mngetset(nd->nd_nodename, sp->setname,
                MD_SET_BAD, &mnsr, ep) == -1) {
                /*
                 * If set doesn't exist on non-owner node,
                 * don't consider this node for master.
                 */
                if (mdiserror(ep, MDE_NO_SET)) {
                    nd = nd->nd_next;
                    continue;
                } else if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    /* RPC failure to another node */
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (nd->nd_nodeid == nr->nr_nodeid) {
                    if (nr->nr_flags & MD_MN_NODE_OK) {
                        /* Found a master */
                        free_sr(
                            (md_set_record *)mnsr);
                        goto found_master;
                    }
                }
                nr = nr->nr_next;
            }
            free_sr((md_set_record *)mnsr);
            nd = nd->nd_next;
        }

        /*
         * - If no node can be found that has its own node record on
         *  its node to be set to OK, then all alive nodes
         *  were in the process of being added to or deleted
         *  from set.  Each alive node will remove all
         *  information pertaining to this set from its node.
         *
         * If all nodes in set are ALIVE, then call sdssc end routines
         * since set was truly being initially created or destroyed.
         */
        goto delete_set;
    } else {

        /*
         * If node is not joined to diskset, then this
         * node uses the following algorithm:
         * - If unjoined node doesn't have a node record for itself,
         *  just delete the diskset since diskset was in the
         *  process of being created.
         * - node needs to find master of diskset before
         *  reconfig cycle, if a master existed.
         * - node calls RPC routine clnt_ownset to get latest
         *  information on which nodes are owners of diskset.
         *  clnt_ownset checks on each node to see if its
         *  kernel has that diskset snarfed.
         */

        /*
         * Is my node in the set description?
         * If not, delete the set from this node.
         * sr2setdesc sets sd_mn_mynode pointer to the node
         * descriptor for this node if there was a node
         * record for this node.
         *
         */
        if (sd->sd_mn_mynode == NULL) {
            goto delete_set;
        }

        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            if (clnt_ownset(nd->nd_nodename, sp,
                &is_owner, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }

            /*
             * Set owner flag for each node based on whether
             * that node really has a diskset mddb snarfed in
             * or not.
             */
            if (is_owner == TRUE)
                nd->nd_flags |= MD_MN_NODE_OWN;
            else
                nd->nd_flags &= ~MD_MN_NODE_OWN;

            nd = nd->nd_next;
        }

        /*
         * - node walks through nodelist looking for nodes that
         *  are owners of the diskset that are in
         *  the membership list.
         * - for each owner, node calls RPC routine clnt_getset to
         *  see if that node has a master set and to get the
         *  diskset description.
         * - If the owner node has a set description that doesn't
         *  include the non-joined node in the nodelist, this node
         *  removes its set description of that diskset
         *  (i.e. removes the set from its local mddbs).  This is
         *  handling the case of when a node was removed from a
         *  diskset while it was not in the cluster membership
         *  list.
         * - If that node has a master set and the master is in the
         *  membership list and is an owner, then either this was
         *  the master from before the reconfig cycle or this
         *  node has already chosen a new master - either way,
         *  the master value is valid as long as it is in the
         *  membership list and is an owner
         * - master is chosen to be owner node's master
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Don't consider a node that isn't an owner */
            if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                nd = nd->nd_next;
                continue;
            }

            /* Get owner node's set record */
            if (clnt_mngetset(nd->nd_nodename, sp->setname,
                MD_SET_BAD, &mnsr, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }

            /* Is this node in the owner node's set record */
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (sd->sd_mn_mynode->nd_nodeid ==
                    nr->nr_nodeid) {
                    break;
                }
                nr = nr->nr_next;
            }
            if (nr == NULL) {
                /* my node not found - delete set */
                free_sr((md_set_record *)mnsr);
                goto delete_set;
            }

            /* Is owner's node's master valid? */
            master_nodeid = mnsr->sr_master_nodeid;
            free_sr((md_set_record *)mnsr);
            if (master_nodeid == MD_MN_INVALID_NID) {
                nd = nd->nd_next;
                continue;
            }

            nd2 = sd->sd_nodelist;
            while (nd2) {
                if ((nd2->nd_nodeid == master_nodeid) &&
                    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
                    (nd2->nd_flags & MD_MN_NODE_OWN)) {
                        nd = nd2;
                        goto found_master;
                }
                nd2 = nd2->nd_next;
            }
            nd = nd->nd_next;
        }

        /*
         * - If no owner node has a valid master, then follow
         *  algorithm of when a node is joined to the diskset.
         * - node walks through nodelist looking for nodes that are
         *  owners of the diskset that are in the membership list.
         * - for each owner, node calls RPC routine clnt_getset to
         *   see if that node has its node record set to OK.
         * - If so, master is chosen to be this owner node.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            /* Don't consider a node that isn't an owner */
            if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
                nd = nd->nd_next;
                continue;
            }

            /* Does node have its own node record set to OK? */
            if (clnt_mngetset(nd->nd_nodename, sp->setname,
                MD_SET_BAD, &mnsr, ep) == -1) {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (nd->nd_nodeid == nr->nr_nodeid) {
                    if (nr->nr_flags & MD_MN_NODE_OK) {
                        /* Found a master */
                        free_sr(
                            (md_set_record *)mnsr);
                        goto found_master;
                    }
                }
                nr = nr->nr_next;
            }
            free_sr((md_set_record *)mnsr);
            nd = nd->nd_next;
        }

        /*
         * - If no owner node has its own node record on its own node
         *  set to OK, then this node checks all of the non-owner
         *  nodes that are in the membership list.
         * - for each non-owner, node calls RPC routine clnt_getset to
         *  see if that node has its node record set to OK.
         * - If set doesn't exist, don't choose node for master.
         * - If this node doesn't exist in the nodelist on any of the
         *  non-owner nodes, this node removes its set description
         *  of that diskset (i.e. removes the set from its local
         *  mddbs). This is handling the case of when a node was
         *  removed from a diskset while it was not in the
         *  cluster membership list.
         * - If non-owner node has its node record set to OK and if
         *  this node hasn't removed this diskset (step directly
         *  before this one), then the master is chosen to be this
         *  non-owner node.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* Don't consider node that isn't in member list */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd->nd_flags |= MD_MN_NODE_DEL;
                nd = nd->nd_next;
                continue;
            }

            /* Don't consider owner nodes since none are OK */
            if (nd->nd_flags & MD_MN_NODE_OWN) {
                nd->nd_flags |= MD_MN_NODE_DEL;
                nd = nd->nd_next;
                continue;
            }

            /*
             * Don't need to get nodelist from my node since
             * this is where sd_nodelist was obtained.
             */
            if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
                nd = nd->nd_next;
                continue;
            }

            /*
             * If node has already been decided against for
             * master, then skip it.
             */
            if (nd->nd_flags & MD_MN_NODE_DEL) {
                nd = nd->nd_next;
                continue;
            }

            /*
             * Does node in my nodelist have its own node
             * record marked OK on its node?  And does node
             * in my nodelist exist on all other nodes?
             * Don't want to choose a node for master unless
             * that node is marked OK on its own node and that
             * node exists on all other alive nodes.
             *
             * This is guarding against the case when several
             * nodes are down and one of the downed nodes is
             * deleted from the diskset.  When the down nodes
             * are rebooted into the cluster, you don't want
             * any node to pick the deleted node as the master.
             */
            if (clnt_mngetset(nd->nd_nodename, sp->setname,
                MD_SET_BAD, &mnsr, ep) == -1) {
                /*
                 * If set doesn't exist on non-owner node,
                 * don't consider this node for master.
                 */
                if (mdiserror(ep, MDE_NO_SET)) {
                    nd->nd_flags |= MD_MN_NODE_DEL;
                    nd = nd->nd_next;
                    continue;
                } else if (mdanyrpcerror(ep)) {
                    /* RPC failure to another node */
                    return (205);
                } else {
                    /* Any other failure */
                    return (-1);
                }
            }
            /*
             * Is my node in the nodelist gotten from the other
             * node?  If not, then remove the set from my node
             * since set was deleted from my node while my node
             * was out of the cluster.
             */
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (sd->sd_mn_mynode->nd_nodeid ==
                    nr->nr_nodeid) {
                    break;
                }
                nr = nr->nr_next;
            }
            if (nr == NULL) {
                /* my node not found - delete set */
                free_sr((md_set_record *)mnsr);
                goto delete_set;
            }

            /* Is node being checked marked OK on its own node? */
            nr = mnsr->sr_nodechain;
            while (nr) {
                if (nd->nd_nodeid == nr->nr_nodeid) {
                    if (!(nr->nr_flags & MD_MN_NODE_OK)) {
                        nd->nd_flags |= MD_MN_NODE_DEL;
                    }
                    break;
                }
                nr = nr->nr_next;
            }
            /*
             * If node being checked doesn't exist on its
             * own node - don't choose it as master.
             */
            if (nr == NULL) {
                nd->nd_flags |= MD_MN_NODE_DEL;
            }

            /*
             * Check every node in my node's nodelist against
             * the nodelist gotten from the other node.
             * If a node in my node's nodelist is not found in the
             * other node's nodelist, then set the DEL flag.
             */
            nd2 = sd->sd_nodelist;
            while (nd2) {
                nr = mnsr->sr_nodechain;
                while (nr) {
                    if (nd2->nd_nodeid == nr->nr_nodeid) {
                        break;
                    }
                    nr = nr->nr_next;
                }
                /* nd2 not found in other node's nodelist */
                if (nr == NULL) {
                    nd2->nd_flags |= MD_MN_NODE_DEL;
                }
                nd2 = nd2->nd_next;
            }

            free_sr((md_set_record *)mnsr);
            nd = nd->nd_next;
        }

        /*
         * Rescan list look for node that has not been marked DEL.
         * First node found is the master.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
                break;
            }
            nd = nd->nd_next;
            continue;
        }
        if (nd) {
            /* Found a master */
            goto found_master;
        }

        /*
         * - If no node can be found that has its own node record on
         *  its node to be set to OK, then all alive nodes
         *  were in the process of being added to or deleted
         *  from set.  Each alive node will remove all
         *  information pertaining to this set from its node.
         *
         * If all nodes in set are ALIVE, then call sdssc end routines
         * since set was truly being initially created or destroyed.
         */
        goto delete_set;
    }

found_master:
    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Set %s master chosen %s (%d): %s"),
        sp->setname, nd->nd_nodename, nd->nd_nodeid,
        meta_print_hrtime(gethrtime() - start_time));

    if (clnt_lock_set(mynode(), sp, ep) == -1) {
        return (-1);
    }

    cl_sk = cl_get_setkey(sp->setno, sp->setname);

    if (clnt_mnsetmaster(mynode(), sp,
        nd->nd_nodename, nd->nd_nodeid, ep)) {
        rval = -1;
    } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
        /* If this node is new master, set flag in this node's kernel */
        (void) memset(&sf, 0, sizeof (sf));
        sf.sf_setno = sp->setno;
        sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
        /* Use magic to help protect ioctl against attack. */
        sf.sf_magic = MDDB_SETFLAGS_MAGIC;
        sf.sf_flags = MDDB_NM_SET;

        meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
            "Setting new master flag for set %s: %s"),
            sp->setname, meta_print_hrtime(gethrtime() - start_time));

        /*
         * Fail reconfig cycle if ioctl fails since it is critical
         * to set new master flag.
         */
        if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
            NULL) != NULL) {
            (void) mdstealerror(ep, &sf.sf_mde);
            rval = -1;
        }
    }

    if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
        if (rval == 0) {
            (void) mdstealerror(ep, &xep);
            rval = -1;
        }
    }

    cl_set_setkey(NULL);

    metaflushsetname(sp);

    return (rval);

delete_set:
    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Master not chosen, deleting set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    /*
     * Remove all set information from this node:
     *  - node records for this set
     *  - drive records for this set
     *  - set record for this set
     * (Only do this on this node since each node
     * will do it for its own local mddb.)
     *
     * If all nodes in set are ALIVE, then
     * the lowest numbered ALIVE nodeid in set
     * (regardless of whether an owner node or not) will
     * call the DCS service to cleanup for create/delete of set.
     *   sdssc_create_end(cleanup) if set was being created or
     *   sdssc_delete_end(cleanup) if set was being deleted.
     * A node record with flag ADD denotes a set being
     * created.  A node record with flag DEL denotes a
     * set being deleted.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        /* Found a node that isn't alive */
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
            break;

        /* Is my node the lowest numbered ALIVE node? */
        if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
            break;
        }
        nd = nd->nd_next;
    }
    if (nd == NULL) {
        /* All nodes ALIVE and this is the lowest nodeid */
        lowest_alive_nodeid = 1;
    }

    if (clnt_lock_set(mynode(), sp, ep) == -1) {
        return (-1);
    }


    /*
     * If this node had been joined, withdraw and reset master.
     *
     * This could happen if a node was being added to or removed
     * from a diskset and the node doing the add/delete operation and
     * all other nodes in the diskset have left the cluster.
     */
    if (sd->sd_mn_mynode) {
        nd = sd->sd_mn_mynode;
        if (nd->nd_flags & MD_MN_NODE_OWN) {
            if (clnt_withdrawset(mynode(), sp, ep)) {
                rval = -1;
                goto out;
            }
            if (clnt_mnsetmaster(mynode(), sp, "",
                MD_MN_INVALID_NID, ep)) {
                rval = -1;
                goto out;
            }
        }
    }

    /*
     * Remove side records for this node (side) from local mddb
     * (clnt_deldrvs does this) if there are drives in the set.
     *
     * Don't need to mark this node as DEL since already marked as
     * ADD or DEL (or this node would have been chosen as master).
     * Don't need to mark other node records, drive records or
     * set records as DEL.  If a panic occurs during clnt_delset,
     * these records will be deleted the next time this node
     * becomes a member and goes through the reconfig cycle.
     */
    /* Get the drive descriptors for this set */
    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep)) == NULL) {
        if (! mdisok(ep)) {
            /*
             * Ignore and clear out any failures from
             * metaget_drivedesc since a panic could have
             * occurred when a node was partially added to a set.
             */
            mdclrerror(ep);
        }
    } else {
        if (clnt_deldrvs(mynode(), sp, dd, ep)) {
            rval = -1;
            goto out;
        }
    }

    /*
     * Now, delete the set - this removes the node, drive
     * and set records from the local mddb.
     */
    if (clnt_delset(mynode(), sp, ep)) {
        rval = -1;
        goto out;
    }

out:
    cl_sk = cl_get_setkey(sp->setno, sp->setname);

    /*
     * Ignore errors from unlock of set since set is no longer
     * known (if clnt_delset worked).
     */
    if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
        mdclrerror(&xep);
    }

    cl_set_setkey(NULL);

    metaflushsetname(sp);

    /*
     * If this node is the lowest numbered nodeid then
     * call sdssc_create/delete_end depending on whether
     * this node is marked as ADD or DEL in the node record.
     */
    if (lowest_alive_nodeid) {
        if (nd->nd_flags & MD_MN_NODE_ADD)
            sdssc_create_end(sp->setname, SDSSC_CLEANUP);
        else if (nd->nd_flags & MD_MN_NODE_DEL)
            sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
    }

    /* Finished with this set -- return */
    return (rval);
}

/*
 * Reconfig step to choose a new master for all MN disksets.
 *
 * Walks every set number from 1 up to (but not including) the value
 * returned by get_max_sets().  Set numbers with no set, and sets that
 * are not multi-node, are skipped.  For each MN set the nodelist is
 * updated with the membership information, a master is chosen and
 * rpc.mdcommd is re-initialised.  Once all sets have been processed,
 * I/O is resumed for all MN disksets.
 *
 * Arguments:
 *  timeout - passed through to mdmn_reinit_set() for each MN set.
 *  ep      - error structure; failures are reported with mde_perror().
 *
 * Return values:
 *  0 - Everything is great.
 *  1 - This node failed to reconfig.
 *  205 - Cause another reconfig due to a nodelist problem
 *      or RPC failure to another node
 *
 * The membership list obtained from meta_read_nodelist() is released on
 * every exit path taken after it has been read successfully.
 */
int
meta_reconfig_choose_master(
    long        timeout,
    md_error_t  *ep
)
{
    set_t               max_sets, setno;
    int             nodecnt;
    mndiskset_membershiplist_t  *nl;
    md_set_desc         *sd;
    mdsetname_t         *sp;
    int             rval = 0;   /* value returned to caller */
    int             set_rval;   /* result for a single set */
    mddb_setflags_config_t      sf;
    int             start_node_delayed = 0;

    if ((max_sets = get_max_sets(ep)) == 0) {
        mde_perror(ep, dgettext(TEXT_DOMAIN,
            "Unable to get number of sets"));
        return (1);
    }

    /*
     * Get membershiplist from API routine.  If there's
     * an error, return a 205 to cause another reconfig.
     */
    if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
        mde_perror(ep, "");
        return (205);
    }

    /*
     * From here on the membership list is held, so every failure
     * leaves through the "out" label instead of returning directly.
     */
    for (setno = 1; setno < max_sets; setno++) {
        if ((sp = metasetnosetname(setno, ep)) == NULL) {
            if (mdiserror(ep, MDE_NO_SET)) {
                /* No set for this setno - continue */
                mdclrerror(ep);
                continue;
            } else {
                /*
                 * If encountered an RPC error from my node,
                 * then immediately fail.
                 */
                if (mdanyrpcerror(ep)) {
                    mde_perror(ep, "");
                    rval = 1;
                    goto out;
                }
                /* Can't get set information */
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to get information for "
                    "set number %d"), setno);
                mdclrerror(ep);
                continue;
            }
        }

        /* If setname is there, set desc should exist. */
        if ((sd = metaget_setdesc(sp, ep)) == NULL) {
            /*
             * If encountered an RPC error from my node,
             * then immediately fail.
             */
            if (mdanyrpcerror(ep)) {
                mde_perror(ep, "");
                rval = 1;
                goto out;
            }
            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to get set %s desc information"),
                sp->setname);
            mdclrerror(ep);
            continue;
        }

        /* Only reconfig MN disksets */
        if (!MD_MNSET_DESC(sd)) {
            continue;
        }

        meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
            "Begin choose master for set %s: %s"),
            sp->setname, meta_print_hrtime(gethrtime() - start_time));

        /* Update nodelist with member information. */
        if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
            /*
             * If encountered an RPC error from my node,
             * then immediately fail.
             */
            if (mdanyrpcerror(ep)) {
                mde_perror(ep, "");
                rval = 1;
                goto out;
            }
            mde_perror(ep, "");
            mdclrerror(ep);
            continue;
        }

        /*
         * If all nodes in a cluster are starting, then
         * all nodes will attempt to contact all other nodes
         * to determine a master node.  This can lead to a
         * problem where node 1 is trying to contact the rpc.metad
         * on node 2 and node 2 is trying to contact the rpc.metad
         * on node 1 -- and this causes the rpc call to fail
         * on both nodes and causes a new reconfig cycle.
         *
         * In order to break this problem, a newly starting node
         * will delay a small amount of time (nodeid mod 4 seconds)
         * and will then run the code to choose a master for the
         * first set.  Delay will only be done once regardless of the
         * number of sets.
         */
        if (start_node_delayed == 0) {
            (void) memset(&sf, 0, sizeof (sf));
            sf.sf_setno = sp->setno;
            sf.sf_flags = MDDB_NM_GET;
            /* Use magic to help protect ioctl against attack. */
            sf.sf_magic = MDDB_SETFLAGS_MAGIC;
            if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
                &sf.sf_mde, NULL) == 0) &&
                ((sf.sf_setflags & MD_SET_MN_START_RC) ==
                MD_SET_MN_START_RC)) {
                (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
            }
            start_node_delayed = 1;
        }

        /* Choose master for this set */
        set_rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
        if (set_rval == -1) {
            mde_perror(ep, "");
            rval = 1;
            goto out;
        } else if (set_rval == 205) {
            mde_perror(ep, "");
            rval = 205;
            goto out;
        }

        /* reinit rpc.mdcommd with new nodelist */
        if (mdmn_reinit_set(sp->setno, timeout)) {
            md_eprintf(dgettext(TEXT_DOMAIN,
                "Could not re-initialise rpc.mdcommd for "
                "set %s\n"), sp->setname);
            rval = 1;
            goto out;
        }

        meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
            "Choose master for set %s completed: %s"),
            sp->setname, meta_print_hrtime(gethrtime() - start_time));
    }

    /*
     * Each node turns on I/Os for all MN disksets.
     * This is to recover from the situation where the master died
     * during a previous reconfig cycle when I/Os were suspended
     * for a MN diskset.
     * If a failure occurs return a 1 which will force this node to
     * panic.  Cannot leave node in the situation where I/Os are
     * not resumed.
     */
    setno = 0; /* 0 means all MN sets */
    if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
        mde_perror(ep, "");
        rval = 1;
    }

out:
    /* Free the nodelist (on success and on every failure above) */
    if (nodecnt)
        meta_free_nodelist(nl);

    return (rval);
}

/*
 * meta_mnsync_user_records will synchronize the diskset user records across
 * all nodes in the diskset.  The diskset user records are stored in
 * each node's local set mddb.
 *
 * This needs to be done even if there is no master change during the
 * reconfig cycle since this routine should clean up any mess left by
 * the untimely termination of a metaset or metadb command (due to a
 * node panic or to user intervention).
 *
 * Caller is the Master node.
 *
 * Returns   0 - Success
 *      205 - Failure during RPC to another node
 *      -1 - Any other failure and ep is filled in.
 */
int
meta_mnsync_user_records(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc     *sd;
    md_mnnode_desc      *master_nodelist, *nd, *nd2, *ndtail;
    md_mnset_record     *mnsr;
    md_mnsr_node_t      *master_mnsr_node = NULL, *mnsr_node = NULL;
    md_mnnode_record    *nr;
    md_drive_record     *dr;
    int         dr_cnt, dd_cnt;
    int         found_my_nr;
    md_drive_desc       *dd, *dd_prev, *master_dd, *other_dd;
    int         all_drives_ok;
    int         rval = 0;
    int         max_genid = 0;
    int         num_alive_nodes, num_alive_nodes_del = 0;
    int         set_locked = 0;
    md_setkey_t     *cl_sk;
    md_error_t      xep = mdnullerror;
    char            *anode[1];
    mddb_setflags_config_t  sf;

    /*
     * Sync up node records first.
     * Construct a master nodelist using the nodelist from this
     * node's rpc.metad node records and then setting the state of each
     * node following these rules:
     *  - If a node record is marked OK on its node, mark it OK
     *      in the master nodelist (and later OK on all nodes)
     *      If a node record is also marked OWN on its node,
     *      mark it OWN in the master nodelist.
     *  - If a node record is not marked OK on its node, then mark
     *      it as DEL in the master list (later deleting it)
     *  - If node record doesn't exist on that node, then mark it DEL
     *      (later deleting it)
     *  - If set record doesn't exist on that node, mark node as DEL
     *  - If a node record doesn't exist on all nodes, then mark it DEL
     *  - If a node is not ALIVE, then
     *      - If that node marked DEL on any node - mark it DEL
     *          in master list but leave in nodelist
     *      - If that node is marked as ADD on any node, mark it
     *          ADD in the master list but leave in nodelist
     *      - When that node returns to the living, the DEL
     *          node record will be removed and the ADD node
     *          record may be removed if marked ADD on that
     *          node.
     * The key rule is to not remove a node from the nodelist until
     * that node record is removed from its own node.  Do not want to
     * remove a node's record from all other nodes and then have
     * that node have its own record marked OK so that a node will pick
     * a different master than the other nodes.
     *
     * Next,
     * If node is ALIVE and node record is marked DEL in master nodelist,
     * remove node from set.
     * If node is ALIVE and node record is marked OK in master nodelist,
     * mark it OK on all other nodes.
     * If node is not ALIVE and node record is marked DEL in master
     * nodelist, mark it DEL on all other nodes.
     * If node is not ALIVE and node record is marked ADD in master,
     * nodelist, mark it ADD on all other nodes.
     */
    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        return (-1);
    }
    master_nodelist = sd->sd_nodelist;

    /*
     * Walk through nodelist creating a master nodelist.
     */
    num_alive_nodes = 0;
    nd = master_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        num_alive_nodes++;
        if (clnt_mngetset(nd->nd_nodename, sp->setname,
            MD_SET_BAD, &mnsr, ep) == -1) {
            if (mdiserror(ep, MDE_NO_SET)) {
                /* set doesn't exist, mark node as DEL */
                nd->nd_flags &= ~MD_MN_NODE_OK;
                nd->nd_flags &= ~MD_MN_NODE_ADD;
                nd->nd_flags |= MD_MN_NODE_DEL;
                nd->nd_flags |= MD_MN_NODE_NOSET;
                nd = nd->nd_next;
                continue;
            } else {
                /* If RPC failure to another node return 205 */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }
        }
        /* Find biggest genid in records for this diskset */
        if (mnsr->sr_genid > max_genid)
            max_genid = mnsr->sr_genid;

        dr = mnsr->sr_drivechain;
        while (dr) {
            /* Find biggest genid in records for this diskset */
            if (dr->dr_genid > max_genid) {
                max_genid = dr->dr_genid;
            }
            dr = dr->dr_next;
        }

        found_my_nr = 0;
        nr = mnsr->sr_nodechain;
        /* nr is the list of node recs from nd_nodename node */
        while (nr) {
            /* Find biggest genid in records for this diskset */
            if (nr->nr_genid > max_genid)
                max_genid = nr->nr_genid;
            nd2 = master_nodelist;
            ndtail = NULL;
            /* For each node record, is it in master list? */
            while (nd2) {
                if (nd2->nd_nodeid == nr->nr_nodeid)
                    break;
                if (nd2->nd_next == NULL)
                    ndtail = nd2;
                nd2 = nd2->nd_next;
            }
            /*
             * Found node record not in master list -- add it
             * to list marking it as DEL since node record
             * should exist on all nodes unless a panic occurred
             * during addition or deletion of host to diskset.
             */
            if (nd2 == NULL) {
                nd2 = Zalloc(sizeof (*nd2));
                (void) strcpy(nd2->nd_nodename,
                    nr->nr_nodename);
                nd2->nd_flags = nr->nr_flags;
                nd2->nd_flags |= MD_MN_NODE_DEL;
                nd2->nd_nodeid = nr->nr_nodeid;
                nd2->nd_next = NULL;
                ndtail->nd_next = nd2;
                nd2 = NULL;
                nr = nr->nr_next;
                continue;
            }
            /*
             * Is this the node record for the node that
             * we requested the set desc from?
             * If so, check if node has its own node record
             * marked OK. If marked OK, check for the OWN bit.
             */
            if (nr->nr_nodeid == nd->nd_nodeid) {
                found_my_nr = 1;
                if (nr->nr_flags & MD_MN_NODE_OK) {
                    /*
                     * If node record is marked OK
                     * on its own node, then mark it OK
                     * in the master list.  Node record
                     * would have to exist on all nodes
                     * in the ADD state before it could
                     * be put into the OK state.
                     */
                    nd->nd_flags |= MD_MN_NODE_OK;
                    nd->nd_flags &=
                        ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
                    /*
                     * Mark own in master list as marked
                     * on own node.
                     */
                    if (nr->nr_flags & MD_MN_NODE_OWN)
                        nd->nd_flags |= MD_MN_NODE_OWN;
                    else
                        nd->nd_flags &= ~MD_MN_NODE_OWN;
                } else {
                    /* Otherwise, mark node as DEL */
                    nd->nd_flags &= ~MD_MN_NODE_OK;
                    nd->nd_flags &= ~MD_MN_NODE_ADD;
                    nd->nd_flags |= MD_MN_NODE_DEL;
                }
            }
            /*
             * If node is not ALIVE and marked DEL
             * on any node, make it DEL in master list.
             * If node is not ALIVE and marked ADD
             * on any node, make it ADD in master list
             * unless node record has already been marked DEL.
             */
            if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
                if (nr->nr_flags & MD_MN_NODE_ADD) {
                    if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
                        /* If not DEL - mark it ADD */
                        nd->nd_flags |= MD_MN_NODE_ADD;
                        nd->nd_flags &= ~MD_MN_NODE_OK;
                    }
                }
                if (nr->nr_flags & MD_MN_NODE_DEL) {
                    nd->nd_flags |= MD_MN_NODE_DEL;
                    nd->nd_flags &= ~MD_MN_NODE_OK;
                    /* Could already be ADD - make it DEL */
                    nd->nd_flags &= ~MD_MN_NODE_ADD;
                }
            }
            nr = nr->nr_next;
        }
        /*
         * If a node record doesn't exist on its own node,
         * then mark node as DEL.
         */
        if (found_my_nr == 0) {
            nd->nd_flags &= ~MD_MN_NODE_OK;
            nd->nd_flags |= MD_MN_NODE_DEL;
        }

        /*
         * If node is OK - put mnsr onto master_mnsr_node list for
         * later use when syncing up the drive records in the set.
         */
        if (nd->nd_flags & MD_MN_NODE_OK) {
            mnsr_node = Zalloc(sizeof (*mnsr_node));
            mnsr_node->mmn_mnsr = mnsr;
            (void) strncpy(mnsr_node->mmn_nodename,
                nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
            mnsr_node->mmn_next = master_mnsr_node;
            master_mnsr_node = mnsr_node;
        } else {
            free_sr((struct md_set_record *)mnsr);
        }

        nd = nd->nd_next;
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Master nodelist created for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    /*
     * Send master nodelist to the rpc.metad on all nodes (including
     * myself) and each node will update itself.  This will set the
     * ADD and DEL flags on each node as setup in the master nodelist.
     * Don't send nodelist to node where set doesn't exist.
     */
    nd = master_nodelist;
    while (nd) {
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
            (nd->nd_flags & MD_MN_NODE_NOSET)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_upd_nr_flags(nd->nd_nodename, sp,
            master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
            /* If RPC failure to another node return 205 */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid !=
                nd->nd_nodeid)) {
                rval = 205;
            } else {
                /* Any other failure */
                rval = -1;
            }
            goto out;
        }
        nd = nd->nd_next;
    }

    /*
     * Now, delete nodes that need to be deleted.
     */
    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep))  == NULL) {
        if (! mdisok(ep)) {
            rval = -1;
            goto out;
        }
    }

    /*
     * May be doing lots of RPC commands to the nodes, so lock the
     * ALIVE members of the set since most of the rpc.metad routines
     * require this for security reasons.
     */
    nd = master_nodelist;
    while (nd) {
        /* Skip non-alive nodes and node without set */
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
            (nd->nd_flags & MD_MN_NODE_NOSET)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
            /* If RPC failure to another node return 205 */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid !=
                nd->nd_nodeid)) {
                rval = 205;
            } else {
                /* Any other failure */
                rval = -1;
            }
            goto out;
        }
        set_locked = 1;
        nd = nd->nd_next;
    }

    nd = master_nodelist;
    while (nd) {
        /* Skip non-alive nodes */
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (nd->nd_flags & MD_MN_NODE_DEL) {
            num_alive_nodes_del++;
            /*
             * Delete this node rec from all ALIVE nodes in diskset.
             */
            nd2 = master_nodelist;
            while (nd2) {
                /* Skip non-alive nodes and node without set */
                if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
                    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
                    nd2 = nd2->nd_next;
                    continue;
                }

                /* This is a node being deleted from set */
                if (nd2->nd_nodeid == nd->nd_nodeid) {
                    /* Mark set record as DEL */
                    if (clnt_upd_sr_flags(nd->nd_nodename,
                        sp, MD_SR_DEL, ep)) {
                        /* RPC failure to !my node */
                        if ((mdanyrpcerror(ep)) &&
                            (sd->sd_mn_mynode->
                            nd_nodeid
                            != nd->nd_nodeid)) {
                            rval = 205;
                        } else {
                            /* Any other failure */
                            rval = -1;
                        }
                        goto out;
                    }
                    if (clnt_deldrvs(nd->nd_nodename, sp,
                        dd, ep)) {
                        /* RPC failure to !my node */
                        if ((mdanyrpcerror(ep)) &&
                            (sd->sd_mn_mynode->
                            nd_nodeid
                            != nd->nd_nodeid)) {
                            rval = 205;
                        } else {
                            /* Any other failure */
                            rval = -1;
                        }
                        goto out;
                    }
                    if (clnt_delset(nd->nd_nodename, sp,
                        ep) == -1) {
                        /* RPC failure to !my node */
                        if ((mdanyrpcerror(ep)) &&
                            (sd->sd_mn_mynode->
                            nd_nodeid
                            != nd->nd_nodeid)) {
                            rval = 205;
                        } else {
                            /* Any other failure */
                            rval = -1;
                        }
                        goto out;
                    }
                } else {
                    /*
                     * Delete host from sets on hosts
                     * not being deleted.
                     */
                    anode[0] = Strdup(nd->nd_nodename);
                    if (clnt_delhosts(nd2->nd_nodename, sp,
                        1, anode, ep) == -1) {
                        Free(anode[0]);
                        /* RPC failure to !my node */
                        if ((mdanyrpcerror(ep)) &&
                            (sd->sd_mn_mynode->
                            nd_nodeid
                            != nd2->nd_nodeid)) {
                            rval = 205;
                        } else {
                            /* Any other failure */
                            rval = -1;
                        }
                        goto out;
                    }

                    meta_mc_log(MC_LOG5,
                        dgettext(TEXT_DOMAIN,
                        "Deleted node %s (%d) on node %s "
                        "from set %s: %s"),
                        nd->nd_nodename, nd->nd_nodeid,
                        nd2->nd_nodename,
                        sp->setname,
                        meta_print_hrtime(
                        gethrtime() - start_time));

                    Free(anode[0]);
                }
                nd2 = nd2->nd_next;
            }
        }
        nd = nd->nd_next;
    }

    nd = master_nodelist;
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    while (nd) {
        /* Skip non-alive nodes and node without set */
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
            (nd->nd_flags & MD_MN_NODE_NOSET)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
            /* If RPC failure to another node return 205 */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid !=
                nd->nd_nodeid)) {
                rval = 205;
            } else {
                /* Any other failure */
                rval = -1;
            }
            goto out;
        }
        nd = nd->nd_next;
    }
    cl_set_setkey(NULL);
    set_locked = 0;

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Nodelist syncronization complete for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    metaflushsetname(sp);

    /*
     * If all alive nodes have been deleted from set, just
     * return since nothing else can be done until non-alive
     * nodes (if there are any) rejoin the cluster.
     */
    if (num_alive_nodes == num_alive_nodes_del) {
        rval = 0;
        goto out;
    }

    /*
     * Sync up drive records.
     *
     * If a node panic'd (or metaset command was killed) during the
     * addition or deletion of a drive to the diskset, the nodes
     * may have a different view of the drive list.  During cleanup
     * of the drive list during reconfig, a drive will be deleted
     * from the list if the master node sees that the drive has been
     * marked in the ADD state on any node or is marked in the DEL state
     * on all nodes.
     * This cleanup must occur even if all nodes in the cluster are
     * not part of the cluster so that all nodes have the same view
     * of the drivelist.
     * Then if the entire cluster goes down and comes back up, the
     * new master node could be a node that wasn't in the cluster when
     * the node was deleted.  This could lead to a situation where the
     * master node thinks that a drive is OK, but this drive isn't
     * known to the other nodes.
     * This situation can also occur during the addition of a drive
     * where a node has the drive marked OK, but the node executing the
     * metaset command encountered a failure before marking that drive OK
     * on the rest of the nodes.  If the node with the OK drive then
     * panics, then rest of the nodes will remove that drive marked ADD
     * and when the node with the OK drive rejoins the cluster, it will
     * have a drive marked OK that is unknown by the other nodes.
     *
     * There are 2 situations to consider:
     * A) Master knows about a drive that other nodes don't know about.
     * B) At least one slave node knows about a drive that the master
     *    node doesn't know about.
     *
     * To handle these situations the following steps are followed:
     * 1) Count number of drives known by this master node and the
     *    other slave nodes.
     *    If all nodes have the same number of drives and the master has
     *    all drives marked OK, then skip to step4.
     *
     * 2) If a node has less drives listed than the master, the master
     *    must get the drive descriptor list from that node so that
     *    master can determine which drive it needs to delete from that
     *    node.  Master must get the drive descriptor list since the
     *    drive record list does not contain the name of the drive, but
     *    only a key and the key can only be interpreted on that other
     *    node.
     *
     * 3) The master will then create the master drive list by doing:
     *  - Master starts with drive list known by master.
     *  - Any drive marked ADD will be removed from the list.
     *  - Any drive not known by another node (from step2) will be
     *  removed from the drive list.
     *  - If a drive is marked DEL on the master, the master must
     *  verify that the drive record is marked DEL on all nodes.
     *  If any node has the drive record marked OK, mark it OK
     *  on the master.  (The reason why is described below).
     *
     * 4) The master sends out the master drive list and the slave
     *    nodes will force their drive lists to match the master
     *    drive list by deleting drives, if necessary and by changing
     *    the drive record states from ADD->OK if master has drive
     *    marked OK and slave has drive marked ADD.
     *
     * Interesting scenarios:
     *
     * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
     *    to delete a drive record (drive record on node 1 is marked DEL),
     *    but is stopped when node 3 panics.  Node 1 also panics.
     *    During reconfig cycle, node 2 is picked as master and the drive
     *    record is left alone since all nodes in the cluster have it
     *    marked OK.  User now sees drive as part of diskset.
     *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
     *    Node 1 is picked as the master and node 1 has drive record
     *    marked DEL.  Node 1 contacts all other nodes in the cluster
     *    and since at least one node has the drive record marked OK,
     *    the master marks the drive record OK.
     *    User continues to see the drive as part of the diskset.
     */

    /* Reget set descriptor since flushed above */
    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        rval = -1;
        goto out;
    }

    /* Has side effect of setting sd->sd_drvs to same as master_dd */
    if ((master_dd = metaget_drivedesc_sideno(sp,
        sd->sd_mn_mynode->nd_nodeid,
        (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
        /* No drives in list */
        if (!mdisok(ep)) {
            /*
             * Can't get drive list for this node, so
             * return -1 causing this node to be removed
             * from cluster config and fixed.
             */
            rval = -1;
            goto out;
        }
    }

    /* Count the number of drives for all nodes */
    mnsr_node = master_mnsr_node;
    while (mnsr_node) {
        dr_cnt = 0;
        dr = mnsr_node->mmn_mnsr->sr_drivechain;
        while (dr) {
            dr_cnt++;
            dr = dr->dr_next;
        }
        mnsr_node->mmn_numdrives = dr_cnt;
        mnsr_node = mnsr_node->mmn_next;
    }

    /* Count the number of drives for the master; also check flags */
    all_drives_ok = 1;
    dd_cnt = 0;
    dd = master_dd;
    while (dd) {
        dd_cnt++;
        if (!(dd->dd_flags & MD_DR_OK))
            all_drives_ok = 0;
        dd = dd->dd_next;
    }

    /* If all drives are ok, do quick check against number of drives */
    if (all_drives_ok) {
        /* If all nodes have same number of drives, almost done */
        mnsr_node = master_mnsr_node;
        while (mnsr_node) {
            if (mnsr_node->mmn_numdrives != dd_cnt)
                break;
            mnsr_node = mnsr_node->mmn_next;
        }
        /* All nodes have same number of drives, just send flags */
        if (mnsr_node == NULL) {
            goto send_drive_list;
        }
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Begin detailed drive synchronization for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    /* Detailed check required  */
    mnsr_node = master_mnsr_node;
    while (mnsr_node) {
        /* Does slave node have less drives than master? */
        if (mnsr_node->mmn_numdrives < dd_cnt) {
            /* Yes - must determine which drive is missing */
            if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
                &other_dd, ep)) {
                /* RPC failure to !my node */
                if ((mdanyrpcerror(ep)) &&
                    (strcmp(mynode(), mnsr_node->mmn_nodename)
                    != 0)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Master node %s unable to "
                    "retrieve drive list from node %s"),
                    mynode(), mnsr_node->mmn_nodename);
                goto out;
            }
            mnsr_node->mmn_dd = other_dd;
            dd = master_dd;
            while (dd) {
                if (!(dd->dd_flags & MD_DR_OK)) {
                    dd = dd->dd_next;
                    continue;
                }
                other_dd = mnsr_node->mmn_dd;
                while (other_dd) {
                    /* Convert to devids, when available */
                    if (strcmp(other_dd->dd_dnp->cname,
                        dd->dd_dnp->cname) == 0) {
                        break;
                    }
                    other_dd = other_dd->dd_next;
                }
                /*
                 * dd not found on slave so mark it
                 * ADD for later deletion (drives in ADD
                 * state are deleted later in this routine).
                 */
                if (other_dd == NULL) {
                    dd->dd_flags = MD_DR_ADD;
                }
                dd = dd->dd_next;
            }

        }
        mnsr_node = mnsr_node->mmn_next;
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Drive check completed for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    dd = master_dd;
    dd_prev = 0;
    while (dd) {
        /* Remove any ADD drives from list */
        if (dd->dd_flags & MD_DR_ADD) {
            if (dd_prev) {
                dd_prev->dd_next = dd->dd_next;
                dd->dd_next = NULL;
                metafreedrivedesc(&dd);
                dd = dd_prev->dd_next;
            } else {
                /*
                 * If removing drive descriptor from head
                 * of linked list, also change sd->sd_drvs.
                 */
                master_dd = sd->sd_drvs = dd->dd_next;
                dd->dd_next = NULL;
                metafreedrivedesc(&dd);
                dd = master_dd;
            }
            /* dd setup in if/else above */
            continue;
        }
        /*
         * If drive is marked DEL, check all other nodes.
         * If drive on another node is marked OK, mark drive OK
         * in master list.  If drive is marked DEL or doesn't exist
         * on all nodes, remove drive from list.
         */
        if (dd->dd_flags & MD_DR_DEL) {
            mnsr_node = master_mnsr_node;
            while (mnsr_node) {
                if (mnsr_node->mmn_dd == NULL) {
                    if (clnt_getdrivedesc(
                        mnsr_node->mmn_nodename, sp,
                        &other_dd, ep)) {
                        /* RPC failure to !my node */
                        if ((mdanyrpcerror(ep)) &&
                            (strcmp(mynode(),
                            mnsr_node->mmn_nodename)
                            != 0)) {
                            rval = 205;
                        } else {
                            /* Any other failure */
                            rval = -1;
                        }
                        mde_perror(ep,
                            dgettext(TEXT_DOMAIN,
                            "Master node %s unable "
                            "to retrieve drive list "
                            "from node %s"), mynode(),
                            mnsr_node->mmn_nodename);
                        goto out;
                    }
                    mnsr_node->mmn_dd = other_dd;
                }
                other_dd = mnsr_node->mmn_dd;
                while (other_dd) {
                    /* Found drive (OK) from other node */
                    if (strcmp(dd->dd_dnp->cname,
                        other_dd->dd_dnp->cname)
                        == 0) {
                        /* Drive marked OK */
                        if (other_dd->dd_flags &
                            MD_DR_OK) {
                            dd->dd_flags = MD_DR_OK;
                        }
                        break;
                    }
                    other_dd = other_dd->dd_next;
                }
                if (dd->dd_flags == MD_DR_OK)
                    break;

                mnsr_node = mnsr_node->mmn_next;
            }
            /*
             * If no node had this drive marked OK, delete it.
             */
            if (dd->dd_flags & MD_DR_DEL) {
                if (dd_prev) {
                    dd_prev->dd_next = dd->dd_next;
                    dd->dd_next = NULL;
                    metafreedrivedesc(&dd);
                    dd = dd_prev->dd_next;
                } else {
                    /*
                     * If removing drive descriptor from
                     * head of linked list, also change
                     * sd->sd_drvs.
                     */
                    master_dd = sd->sd_drvs = dd->dd_next;
                    dd->dd_next = NULL;
                    metafreedrivedesc(&dd);
                    dd = master_dd;
                }
                /* dd setup in if/else above */
                continue;
            }
        }
        dd_prev = dd;
        dd = dd->dd_next;
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Setting drive states completed for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

send_drive_list:
    /*
     * Set genid on all drives to be the highest value seen.
     */
    dd = master_dd;
    while (dd) {
        dd->dd_genid = max_genid;
        dd = dd->dd_next;
    }
    /*
     * Send updated drive list to all alive nodes.
     * Will also set genid on set and node records to have same
     * as the drive records.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        /* Skip non-alive nodes */
        if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
            /* RPC failure to another node */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
                rval = 205;
            } else {
                /* Any other failure */
                rval = -1;
            }
            goto out;
        }
        nd = nd->nd_next;
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Sent drive list to all nodes for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    /*
     * If no drive records left in set and nodes had been joined,
     * withdraw the nodes.  Always reset the master and mark
     * all nodes as withdrawn on all nodes.
     */
    if (master_dd == NULL) {
        /* Reset new master flag since no longer master */
        (void) memset(&sf, 0, sizeof (sf));
        sf.sf_setno = sp->setno;
        sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
        sf.sf_flags = MDDB_NM_RESET;
        /* Use magic to help protect ioctl against attack. */
        sf.sf_magic = MDDB_SETFLAGS_MAGIC;
        /* Ignore failure, failure to reset flag isn't catastrophic */
        (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
            &sf.sf_mde, NULL);

        meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
            "Reset new master flag for " "set %s: %s"),
            sp->setname, meta_print_hrtime(gethrtime() - start_time));

        nd = sd->sd_nodelist;
        while (nd) {
            /* Skip non-alive nodes  */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }

            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                /* RPC failure to another node */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }
            set_locked = 1;

            /* Withdraw node from set if owner */
            if ((nd->nd_flags & MD_MN_NODE_OWN) &&
                (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
                /* RPC failure to another node */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }

            /* Mark all nodes as withdrawn on this node */
            if (clnt_upd_nr_flags(nd->nd_nodename, sp,
                sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
                /* RPC failure to another node */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }

            /* Resets master to no-master on this node */
            if (clnt_mnsetmaster(nd->nd_nodename, sp,
                "", MD_MN_INVALID_NID, ep)) {
                /* RPC failure to another node */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }

            cl_sk = cl_get_setkey(sp->setno, sp->setname);
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
                /* RPC failure to another node */
                if ((mdanyrpcerror(ep)) &&
                    (sd->sd_mn_mynode->nd_nodeid !=
                    nd->nd_nodeid)) {
                    rval = 205;
                } else {
                    /* Any other failure */
                    rval = -1;
                }
                goto out;
            }
            set_locked = 0;
            nd = nd->nd_next;
        }
    }

out:
    /*
     * If got here and set is still locked, then an error has
     * occurred and master_nodelist is still valid.
     * If error is not an RPC error, then unlock.
     * If error is an RPC error, skip unlocks since this could cause
     * yet another RPC timeout if a node has failed.
     * Ignore failures in unlock since unlock is just trying to
     * clean things up.
     */
    if ((set_locked) && !(mdanyrpcerror(ep))) {
        nd = master_nodelist;
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        while (nd) {
            /* Skip non-alive nodes */
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                nd = nd->nd_next;
                continue;
            }
            /*
             * If clnt_unlock fails, just break out since next
             * reconfig cycle will reset the locks anyway.
             */
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
                break;
            }
            nd = nd->nd_next;
        }
        cl_set_setkey(NULL);
    }
    /* Free master_mnsr and drive descs */
    mnsr_node = master_mnsr_node;
    while (mnsr_node) {
        master_mnsr_node = mnsr_node->mmn_next;
        free_sr((md_set_record *)mnsr_node->mmn_mnsr);
        free_rem_dd(mnsr_node->mmn_dd);
        Free(mnsr_node);
        mnsr_node = master_mnsr_node;
    }

    /* Frees sd->sd_drvs (which is also master_dd) */
    metaflushsetname(sp);
    return (rval);
}

/*
 * meta_mnsync_diskset_mddbs
 * Calling node is guaranteed to be an owner node.
 * Calling node is the master node.
 *
 * Master node verifies that ondisk mddb format matches its incore format.
 * If no nodes are joined to set, remove the change log entries.
 * If a node is joined to set, play the change log.
 *
 * Outline:
 *   1) No drives in set: nothing to do.
 *   2) Master not yet joined: join it, publish the owner flag, flush the
 *      change log (unless the set is STALE), reset the new-master flag.
 *   3) Master joined but set is STALE, or master is not newly chosen:
 *      nothing to do.
 *   4) Otherwise: suspend I/O on all alive owner nodes, check/rewrite the
 *      ondisk mddbs, resume I/O, replay the change log and reset the
 *      new-master flag.
 *
 * Returns   0 - Success
 *           1 - Master unable to join to set.
 *         205 - Failure during RPC to another node
 *          -1 - Any other failure and ep is filled in.
 *               -1 return will eventually cause node to panic
 *               in a SunCluster environment.
 */
int
meta_mnsync_diskset_mddbs(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc     *sd;
    mddb_config_t       c;
    md_mn_msgclass_t    class;
    mddb_setflags_config_t  sf;
    md_mnnode_desc      *nd, *nd2;
    md_error_t      xep = mdnullerror;
    int         stale_set = 0;

    /* If setname is there, set desc should exist. */
    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        mde_perror(ep, dgettext(TEXT_DOMAIN,
            "Unable to get set %s desc information"), sp->setname);
        return (-1);
    }

    /* Are there drives in the set? */
    if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep) == NULL) {
        if (! mdisok(ep)) {
            return (-1);
        }
        /* No drives in set -- nothing to sync up */
        return (0);
    }

    /*
     * Is master node (which is this node) joined to set?
     * If master node isn't joined (which means that no nodes
     * are joined to diskset), remove the change log entries
     * since no need to replay them - all nodes will have same
     * view of mddbs since all nodes are reading in the mddbs
     * from disk.
     * There is also no need to sync up the master and ondisk mddbs
     * since master has no incore knowledge.
     * Need to join master to set in order to flush the change
     * log entries. Don't need to block I/O during join of master
     * to set since no other nodes are joined to set and so no I/O
     * can be occurring.
     */
    if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
        /* Join master to set */
        if (clnt_joinset(mynode(), sp,
            MNSET_IN_RECONFIG, ep)) {
            if (mdismddberror(ep, MDE_DB_STALE)) {
                /*
                 * If STALE, print message and continue on.
                 * Don't do any writes or reads to mddbs
                 * so don't clear change log.
                 */
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Join of master node to STALE set %s"),
                    sp->setname);
                stale_set = 1;
                mdclrerror(ep);
            } else if (mdismddberror(ep, MDE_DB_ACCOK)) {
                /* ACCOK means mediator provided extra vote */
                mdclrerror(ep);
            } else {
                /*
                 * If master is unable to join set, print an
                 * error message.  Don't return failure or node
                 * will panic during cluster reconfig cycle.
                 * Also, withdraw node from set in order to
                 * cleanup from failed join attempt.
                 */
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Join of master node in set %s failed"),
                    sp->setname);
                if (clnt_withdrawset(mynode(), sp, &xep))
                    mdclrerror(&xep);
                return (1);
            }
        }
        /*
         * Master node successfully joined.
         * Set local copy of flags to OWN and
         * send owner flag to rpc.metad. If not stale,
         * flush the change log.
         */
        sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
        if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
            MNSET_IN_RECONFIG, ep)) {
            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Flag update of master node join in set %s failed"),
                sp->setname);
            return (-1);
        }

        if (!stale_set) {
            if (mdmn_reset_changelog(sp, ep,
                MDMN_CLF_RESETLOG) != 0) {
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to reset changelog."));
                return (-1);
            }
            meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
                "Removed changelog entries for set %s: %s"),
                sp->setname,
                meta_print_hrtime(gethrtime() - start_time));
        }
        /* Reset new master flag before return */
        (void) memset(&sf, 0, sizeof (sf));
        sf.sf_setno = sp->setno;
        sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
        sf.sf_flags = MDDB_NM_RESET;
        /* Use magic to help protect ioctl against attack. */
        sf.sf_magic = MDDB_SETFLAGS_MAGIC;
        /* Ignore failure, failure to reset flag isn't catastrophic */
        (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
            &sf.sf_mde, NULL);

        meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
            "Reset new master flag for set %s: %s"),
            sp->setname, meta_print_hrtime(gethrtime() - start_time));

        return (0);
    }

    /*
     * Is master already joined to STALE set (< 50% mddbs avail)?
     * If so, can make no config changes to mddbs so don't check or play
     * changelog and don't sync master node to ondisk mddbs.
     * To get out of the stale state all nodes must be withdrawn
     * from set.  Then as nodes are re-joined, all nodes will
     * have same view of mddbs since all nodes are reading the
     * mddbs from disk.
     */
    (void) memset(&c, 0, sizeof (c));
    c.c_id = 0;
    c.c_setno = sp->setno;
    if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
        (void) mdstealerror(ep, &c.c_mde);
        return (-1);
    }
    if (c.c_flags & MDDB_C_STALE) {
        return (0);
    }

    /*
     * If this node is NOT a newly chosen master, then there's
     * nothing else to do since the change log should be empty and
     * the ondisk and incore mddbs are already consistent.
     *
     * A newly chosen master is a node that was not the master
     * at the beginning of the reconfig cycle.  If a node is a new
     * master, then the new master state is reset after the ondisk
     * and incore mddbs are consistent and the change log has
     * been replayed.
     *
     * Note that a failure of the GET ioctl falls through and is
     * treated the same as "this node is a new master".
     */
    (void) memset(&sf, 0, sizeof (sf));
    sf.sf_setno = sp->setno;
    sf.sf_flags = MDDB_NM_GET;
    /* Use magic to help protect ioctl against attack. */
    sf.sf_magic = MDDB_SETFLAGS_MAGIC;
    if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
        ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
        return (0);
    }

    /*
     * Now, sync up incore master view to ondisk mddbs.
     * This is needed in the case where a master node
     * had made a change to the mddb, but this change
     * may not have been relayed to the slaves yet.
     * So, the new master needs to verify that the ondisk
     * mddbs match what the new master has incore -
     * if different, new master rewrites all of the mddbs.
     * Then the new master will replay the changelog and the
     * new master will then execute what the old master had
     * done.
     *
     * Block all I/Os to disks in this diskset on all nodes in
     * the diskset.  This will allow the rewriting of the mddbs
     * (if needed), to proceed in a timely manner.
     *
     * If block of I/Os fail, return a -1.
     */

    nd = sd->sd_nodelist;
    while (nd) {
        /* Skip non-alive and non-owner nodes  */
        if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
            (!(nd->nd_flags & MD_MN_NODE_OWN))) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
            MN_SUSP_IO, ep)) {
            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to suspend I/O on node %s in set %s"),
                nd->nd_nodename, sp->setname);

            /*
             * Resume all other nodes that had been suspended.
             * (Reconfig return step also resumes I/Os
             * for all sets.)
             */
            nd2 = sd->sd_nodelist;
            while (nd2) {
                /* Stop when reaching failed node */
                if (nd2->nd_nodeid == nd->nd_nodeid)
                    break;
                /* Skip non-alive and non-owner nodes  */
                if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
                    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
                    nd2 = nd2->nd_next;
                    continue;
                }
                /*
                 * Best effort only; drop any error so that
                 * xep is clean for the next call.
                 */
                if (clnt_mn_susp_res_io(nd2->nd_nodename,
                    sp->setno, MN_RES_IO, &xep))
                    mdclrerror(&xep);
                nd2 = nd2->nd_next;
            }

            /*
             * If an RPC failure on another node, return a 205.
             * Otherwise, exit with failure.
             */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid !=
                nd->nd_nodeid)) {
                return (205);
            } else {
                return (-1);
            }

        }
        nd = nd->nd_next;
    }

    (void) memset(&c, 0, sizeof (c));
    c.c_id = 0;
    c.c_setno = sp->setno;
    /*
     * Master can't sync up to ondisk mddbs?  Kick it out of cluster.
     * Hand the ioctl error to the caller so that ep is filled in as
     * promised for a -1 return.
     */
    if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) {
        (void) mdstealerror(ep, &c.c_mde);
        return (-1);
    }

    /*
     * Resume I/Os that were suspended above.
     */
    nd = sd->sd_nodelist;
    while (nd) {
        /* Skip non-alive and non-owner nodes  */
        if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
            (!(nd->nd_flags & MD_MN_NODE_OWN))) {
            nd = nd->nd_next;
            continue;
        }
        if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
            MN_RES_IO, ep)) {
            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to resume I/O on node %s in set %s"),
                nd->nd_nodename, sp->setname);

            /*
             * If an RPC failure then don't do any
             * more RPC calls, since one timeout is enough
             * to endure.  If RPC failure to another node, return
             * 205.  If RPC failure to my node, return -1.
             * If not an RPC failure, continue resuming the
             * rest of the nodes and then return -1.
             */
            if (mdanyrpcerror(ep)) {
                if (sd->sd_mn_mynode->nd_nodeid ==
                    nd->nd_nodeid) {
                    return (-1);
                } else {
                    return (205);
                }
            }

            /*
             * If not an RPC error, continue resuming rest of
             * nodes, ignoring any failures except for an
             * RPC failure which constitutes an immediate exit.
             * Start in middle of list with failing node.
             */
            nd2 = nd->nd_next;
            while (nd2) {
                /* Skip non-alive and non-owner nodes  */
                if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
                    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
                    nd2 = nd2->nd_next;
                    continue;
                }
                if (clnt_mn_susp_res_io(nd2->nd_nodename,
                    sp->setno, MN_RES_IO, &xep)) {
                    int rpc_failed = mdanyrpcerror(&xep);

                    /* xep is scratch; never hand it back */
                    mdclrerror(&xep);
                    if (rpc_failed) {
                        return (-1);
                    }
                }
                nd2 = nd2->nd_next;
            }

            /*
             * All remaining nodes have been given a resume
             * attempt.  Report the original failure, which is
             * still held in ep, rather than falling back into
             * the outer loop and carrying on as if the resume
             * had succeeded.
             */
            return (-1);
        }
        nd = nd->nd_next;
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
        "checking/writing the mddb for set %s: %s"), sp->setname,
        meta_print_hrtime(gethrtime() - start_time));

    /*
     * Send (aka replay) all messages we find in the changelog.
     * Flag the messages with
     *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
     *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
     *
     * A failed replay is logged but does not fail this routine.
     */
    for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
        mdmn_changelog_record_t *lr;
        md_error_t  send_ep = mdnullerror;
        md_mn_result_t  *resultp = NULL;
        int     ret;

        lr = mdmn_get_changelogrec(sp->setno, class);
        /*
         * NOTE(review): mdmn_get_changelogrec() is defined outside
         * this file; guard against a NULL record instead of
         * dereferencing it blindly -- confirm its contract.
         */
        if (lr == NULL) {
            continue;
        }
        if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
            /* no entry for this class */
            continue;
        }

        meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
            "replaying message ID=(%d, 0x%llx-%d)\n"),
            MSGID_ELEMS(lr->lr_msg.msg_msgid));

        ret = mdmn_send_message_with_msgid(
            lr->lr_msg.msg_setno,
            lr->lr_msg.msg_type,
            lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
            MD_MSGF_OVERRIDE_SUSPEND,
            lr->lr_msg.msg_recipient,
            lr->lr_msg.msg_event_data,
            lr->lr_msg.msg_event_size,
            &resultp,
            &lr->lr_msg.msg_msgid,
            &send_ep);

        meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
            "mdmn_send_message returned %d\n"), ret);

        if (resultp)
            free_result(resultp);

        /* Release whatever the send left behind in the scratch error */
        if (!mdisok(&send_ep))
            mdclrerror(&send_ep);
    }

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Playing changelog completed for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    /*
     * Now that new master has ondisk and incore mddbs in sync, reset
     * this node's new master kernel flag (for this set).  If this node
     * re-enters another reconfig cycle before the completion of this
     * reconfig cycle, this master node won't need to check if the ondisk
     * and incore mddbs are in sync since this node won't be considered
     * a new master (since this flag is being reset here in the middle of
     * step2).  This will save time during any subsequent reconfig
     * cycles as long as this node continues to be master.
     */
    (void) memset(&sf, 0, sizeof (sf));
    sf.sf_setno = sp->setno;
    sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
    sf.sf_flags = MDDB_NM_RESET;
    /* Use magic to help protect ioctl against attack. */
    sf.sf_magic = MDDB_SETFLAGS_MAGIC;
    /* Ignore failure, since failure to reset flag isn't catastrophic */
    (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);

    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Reset new master flag for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    return (0);
}

/*
 * meta_mnjoin_all will join all starting nodes in the diskset.
 * A starting node is considered to be any node that is not
 * an owner of the set but is a member of the cluster.
 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
 *
 * Caller is the Master node.
 *
 * Returns   0 - Success
 *      205 - Failure during RPC to another node
 *      -1 - Any other failure and ep is filled in.
 */
int
meta_mnjoin_all(
    mdsetname_t *sp,
    md_error_t  *ep
)
{
    md_set_desc     *sd;
    md_mnnode_desc      *nd, *nd2;
    int         rval = 0;
    int         stale_flag = 0;
    mddb_config_t       c;
    int         susp_res_flag = 0;
    md_error_t      xep = mdnullerror;

    /*
     * Overview of the steps performed below:
     *  1. Look up the set descriptor and bail out early if the set
     *     has no drives.
     *  2. Determine whether the set is stale.
     *  3. If not stale and at least one alive node is not an owner,
     *     suspend I/O on every alive owner node.
     *  4. Join every alive non-owner node, marking it as owner.
     *  5. Resume I/O if it was suspended in step 3.
     *  6. Push the node record flags to every owner node.
     */

    /* If setname is there, set desc should exist. */
    if ((sd = metaget_setdesc(sp, ep)) == NULL) {
        mde_perror(ep, dgettext(TEXT_DOMAIN,
            "Unable to get set %s desc information"), sp->setname);
        return (-1);
    }

    /* Are there drives in the set? */
    if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep) == NULL) {
        if (! mdisok(ep)) {
            return (-1);
        }
        /* No drives in set -- nothing to join */
        return (0);
    }

    /*
     * Is set currently stale?
     */
    (void) memset(&c, 0, sizeof (c));
    c.c_id = 0;
    c.c_setno = sp->setno;
    /* Ignore failure since master node may not be joined yet */
    (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
    if (c.c_flags & MDDB_C_STALE) {
        stale_flag = MNSET_IS_STALE;
    }

    /*
     * If any nodes are going to be joined to diskset, then
     * suspend I/O to all disks in diskset so that nodes can join
     * (read in mddbs) in a reasonable amount of time even under
     * high I/O load.  Don't need to do this if set is STALE since
     * no I/O can be occurring to a STALE set.
     */
    if (stale_flag != MNSET_IS_STALE) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            /* Found a node that will be joined to diskset */
            if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
                (!(nd->nd_flags & MD_MN_NODE_OWN))) {
                /* Set flag that diskset should be suspended */
                susp_res_flag = 1;
                break;
            }
        }
    }

    if (susp_res_flag) {
        /*
         * Block all I/Os to disks in this diskset on all joined
         * nodes in the diskset.
         * If block of I/Os fails due to an RPC failure on another
         * node, return 205; otherwise, return -1.
         */
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            /* Skip non-alive and non-owner nodes  */
            if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
                (!(nd->nd_flags & MD_MN_NODE_OWN))) {
                continue;
            }
            if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
                MN_SUSP_IO, ep) == 0) {
                continue;
            }

            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to suspend I/O on node %s"
                " in set %s"), nd->nd_nodename,
                sp->setname);
            /*
             * Resume other nodes that had been suspended,
             * i.e. the owner nodes that precede the failing
             * node in the list.  Errors go to the scratch
             * error xep so that ep keeps the suspend failure.
             * (Reconfig return step also resumes I/Os
             * for all sets.)
             */
            for (nd2 = sd->sd_nodelist; nd2 != NULL;
                nd2 = nd2->nd_next) {
                /* Stop when reaching failed node */
                if (nd2->nd_nodeid == nd->nd_nodeid)
                    break;
                /* Skip non-alive/non-owner nodes  */
                if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
                    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
                    continue;
                }
                (void) (clnt_mn_susp_res_io(
                    nd2->nd_nodename, sp->setno,
                    MN_RES_IO, &xep));
            }

            /*
             * If the suspend failed due to an
             * RPC failure on another node, return
             * a 205.
             * Otherwise, exit with failure.
             * The return reconfig step will resume
             * I/Os for all disksets.
             */
            if ((mdanyrpcerror(ep)) &&
                (sd->sd_mn_mynode->nd_nodeid !=
                nd->nd_nodeid)) {
                return (205);
            }
            return (-1);
        }
    }

    for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
        /*
         * If a node is in the membership list but isn't joined
         * to the set, try to join the node.
         */
        if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
            (nd->nd_flags & MD_MN_NODE_OWN)) {
            continue;
        }
        if (clnt_joinset(nd->nd_nodename, sp,
            (MNSET_IN_RECONFIG | stale_flag), ep)) {
            /*
             * If RPC failure to another node
             * then exit without attempting anything else.
             * (Reconfig return step will resume I/Os
             * for all sets.)
             */
            if (mdanyrpcerror(ep)) {
                mde_perror(ep, "");
                return (205);
            }
            /*
             * STALE and ACCOK failures aren't true
             * failures.  STALE means that <50% mddbs
             * are available. ACCOK means that the
             * mediator provided the extra vote.
             * If a true failure, then print message
             * and withdraw node from set in order to
             * cleanup from failed join attempt.
             */
            if ((!mdismddberror(ep, MDE_DB_STALE)) &&
                (!mdismddberror(ep, MDE_DB_ACCOK))) {
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "WARNING: Unable to join node %s "
                    "to set %s"), nd->nd_nodename,
                    sp->setname);
                mdclrerror(ep);
                if (clnt_withdrawset(nd->nd_nodename,
                    sp, &xep))
                    mdclrerror(&xep);
                /* Node stays a non-owner; try the next one */
                continue;
            }
        }
        /* Set owner flag even if STALE or ACCOK */
        nd->nd_flags |= MD_MN_NODE_OWN;
    }

    /*
     * Resume I/Os if suspended above.
     */
    if (susp_res_flag) {
        for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
            /*
             * Skip non-alive and non-owner nodes.
             * NOTE(review): nodes joined in the loop above now
             * carry MD_MN_NODE_OWN, so they are sent MN_RES_IO
             * here as well even though this function never
             * suspended them -- confirm that is harmless.
             */
            if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
                (!(nd->nd_flags & MD_MN_NODE_OWN))) {
                continue;
            }
            if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
                MN_RES_IO, ep) == 0) {
                continue;
            }

            mde_perror(ep, dgettext(TEXT_DOMAIN,
                "Unable to resume I/O on node %s"
                " in set %s"), nd->nd_nodename,
                sp->setname);

            /*
             * If an RPC failure then don't do any
             * more RPC calls, since one timeout is enough
             * to endure.  If RPC failure to another node,
             * return 205.  If RPC failure to my node,
             * return -1.
             * (Reconfig return step will resume I/Os
             * for all sets.)
             * If not an RPC failure, continue resuming the
             * rest of the nodes and then return -1.
             */
            if (mdanyrpcerror(ep)) {
                if (sd->sd_mn_mynode->nd_nodeid ==
                    nd->nd_nodeid) {
                    return (-1);
                } else {
                    return (205);
                }
            }

            /*
             * Not an RPC error: resume the nodes that follow
             * the failing node, ignoring any failures except
             * for an RPC failure which constitutes an
             * immediate exit.  Errors go to xep so that ep
             * still describes the original resume failure.
             */
            for (nd2 = nd->nd_next; nd2 != NULL;
                nd2 = nd2->nd_next) {
                /* Skip non-alive/non-owner nodes  */
                if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
                    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
                    continue;
                }
                (void) (clnt_mn_susp_res_io(
                    nd2->nd_nodename, sp->setno,
                    MN_RES_IO, &xep));
                if (mdanyrpcerror(&xep)) {
                    return (-1);
                }
            }

            /*
             * Every remaining node has been handled by the loop
             * above, so report the failure now.  Falling back
             * into the outer loop would resume the same nodes a
             * second time and could let the function return 0
             * despite the resume failure recorded in ep.
             */
            return (-1);
        }
    }

    for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
        if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
            continue;
        }
        /*
         * If 1 node fails - go ahead and update the rest except
         * in the case of an RPC failure, fail immediately.
         */
        if (clnt_upd_nr_flags(nd->nd_nodename, sp,
            sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
            /* RPC failure to another node */
            if (mdanyrpcerror(ep)) {
                return (205);
            }
            /* Remember the failure but keep updating the rest */
            rval = -1;
        }
    }

    /*
     * start_time is not declared in this function; presumably it is
     * defined at file or global scope -- verify.
     */
    meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
        "Join of all nodes completed for set %s: %s"),
        sp->setname, meta_print_hrtime(gethrtime() - start_time));

    return (rval);
}