libmeta/common/meta_set_med.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * Metadevice diskset interfaces
 */

#include "meta_set_prv.h"
#include <sys/lvm/md_crc.h>
#include <sys/lvm/mdmed.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

#define MALSIZ  32

/*
 * Append a private copy of item to the NULL-terminated string vector
 * *listp, creating the vector when *listp is NULL.
 *
 * The vector is sized in steps of MALSIZ slots.  Whenever the new element
 * count lands on a multiple of MALSIZ the vector is enlarged by another
 * MALSIZ slots, all set to NULL, so a terminating NULL always follows the
 * last string.  (The first allocation relies on Zalloc() handing back
 * zero-filled storage for the same purpose.)
 *
 * Returns the number of strings held by the list after the append.
 */
static int
add_lst(char ***listp, char *item)
{
    char    **vec = *listp;
    int     cnt = 0;
    int     k;

    if (vec == NULL) {
        /* first use: start with one chunk of empty slots */
        vec = (char **)Zalloc(MALSIZ * sizeof (char *));
        *listp = vec;
    } else {
        /* walk to the terminating NULL to find the append position */
        while (vec[cnt] != NULL)
            cnt++;
    }

    vec[cnt] = Strdup(item);
    cnt++;

    /* still room for the terminator in the current chunk */
    if ((cnt % MALSIZ) != 0)
        return (cnt);

    /* chunk is full: add another MALSIZ slots and clear them */
    vec = (char **)Realloc((void *)vec, (cnt + MALSIZ) * sizeof (char *));
    *listp = vec;
    for (k = 0; k < MALSIZ; k++)
        vec[cnt + k] = (char *)NULL;

    return (cnt);
}

/*
 * Free every string in the NULL-terminated vector *listp, free the vector
 * itself, and reset *listp to NULL.
 *
 * Returns 1 when a list was released, 0 when *listp was already NULL.
 */
static int
del_lst(char ***listp)
{
    char    **vec = *listp;
    char    **slot;

    /* nothing to release */
    if (vec == NULL)
        return (0);

    for (slot = vec; *slot != NULL; slot++)
        free(*slot);

    free(vec);
    *listp = NULL;
    return (1);
}


/*
 * Check the host entries in mhp before they are used as mediators for
 * set sp.
 *
 * For every non-empty slot of mhp->n_lst[]:
 *   - each name must consist solely of printable characters, none of which
 *     appear in INVALID_IN_NAMES (else MDE_DS_INVALIDMEDNAME);
 *   - each name is handed to clnt_med_hostname() and the hostname it
 *     reports is compared: for the first name of a slot it must be
 *     identical to that name (else MDE_DS_NOTNODENAME); for every further
 *     name (alias) of the slot it must be identical to the slot's first
 *     name (else MDE_DS_ALIASNOMATCH).
 *
 * Returns 0 when every entry passes.  On the first failure a non-zero
 * value is returned with the error recorded in ep.
 */
static int
validate_med_nodes(
    mdsetname_t *sp,
    md_h_arr_t  *mhp,
    md_error_t  *ep
)
{
    char        *hostname;
    char        *nodename = NULL;
    char        *nm;
    char        *cp;
    int     i, j;


    for (i = 0; i < MED_MAX_HOSTS; i++) {
        if (mhp->n_lst[i].a_cnt == 0)
            continue;

        for (j = 0; j < mhp->n_lst[i].a_cnt; j++) {
            nm = mhp->n_lst[i].a_nm[j];

            /*
             * The <ctype.h> classifiers are only defined for
             * values representable as unsigned char (or EOF), so
             * convert before the call; a plain char with the high
             * bit set would otherwise be passed as a negative int.
             */
            for (cp = nm; *cp; cp++)
                if (!isprint((unsigned char)*cp) ||
                    strchr(INVALID_IN_NAMES, *cp) != NULL)
                    return (mddserror(ep,
                        MDE_DS_INVALIDMEDNAME,
                        sp->setno, nm, NULL, sp->setname));

            if (clnt_med_hostname(nm, &hostname, ep))
                return (-1);

            if (j == 0) {
                /* first name in the slot must be the node name */
                if (strcmp(nm, hostname) != 0) {
                    Free(hostname);
                    return (mddserror(ep,
                        MDE_DS_NOTNODENAME, sp->setno, nm,
                        NULL, sp->setname));
                }
                nodename = nm;
            } else {
                /* aliases must lead to the same node name */
                if (strcmp(nodename, hostname) != 0) {
                    Free(hostname);
                    return (mddserror(ep,
                        MDE_DS_ALIASNOMATCH, sp->setno, nm,
                        nodename, sp->setname));
                }
            }
            Free(hostname);
        }
    }
    return (0);
}

/*
 * Exported Entry Points
 */

/*
 * Add the mediator hosts named in node_v[0 .. node_c - 1] to diskset sp.
 *
 * Each node_v[] entry is a comma separated list: the mediator's node name
 * followed by optional aliases.  The entries are tokenized in place with
 * strtok(), so the caller's strings are modified.
 *
 * The code up to "END CHECK CODE" validates the request and builds both
 * the new mediator list/record and the roll-back copies (rb_t, rb_medr).
 * After that the change is pushed out in stages; rb_level records how many
 * stages have completed so that the code at the rollback label can undo
 * them in reverse order.
 *
 * Returns 0 on success.  On failure a non-zero value is returned and the
 * error is recorded in ep.
 */
int
meta_set_addmeds(
    mdsetname_t     *sp,
    int         node_c,
    char            **node_v,
    md_error_t      *ep
)
{
    md_set_desc     *sd = NULL;
    md_drive_desc       *dd = NULL;
    mddb_med_parm_t     mp;
    mddb_med_upd_parm_t mup;
    md_h_arr_t      t;
    md_h_arr_t      rb_t;
    med_rec_t       medr;
    med_rec_t       rb_medr;
    char            *cp;
    char            **n_l = NULL;
    int         n_c = 0;
    int         i, j;
    sigset_t        oldsigs;
    md_setkey_t     *cl_sk;
    int         rb_level = 0;
    md_error_t      xep = mdnullerror;
    int         rval = 0;
    int         max_meds;
    md_mnnode_desc      *nd;
    int         suspend1_flag = 0;
    int         lock_flag = 0;

    /* Initialize */
    (void) memset(&t, '\0', sizeof (t));
    t.n_cnt = node_c;
    mdclrerror(ep);

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    /* Make sure we own the set */
    if (meta_check_ownership(sp, ep) != 0)
        return (-1);

    if ((max_meds = get_max_meds(ep)) == 0)
        return (-1);

    /*
     * The mediator information (which is part of the set record) is
     * stored in the local mddbs of each node in the diskset.
     * Each node's rpc.metad daemon reads in the set
     * records from that node's local mddb and caches them
     * internally. Any process needing diskset information contacts its
     * local rpc.metad to get this information.  Since each node in the
     * diskset is independently reading the set information from its local
     * mddb, the set records in the local mddbs must stay
     * in-sync, so that all nodes have a consistent view of the diskset.
     *
     * For a multinode diskset, explicitly verify that all nodes in the
     * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
     * fail this operation since all nodes must be ALIVE in order to add
     * the mediator information to the set record in their local mddb.
     * If a panic of this node leaves the local mddbs set records
     * out-of-sync, the reconfig cycle will fix the local mddbs and
     * force them back into synchronization.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
                    sp->setno,
                    nd->nd_nodename, NULL, sp->setname);
                return (-1);
            }
            nd = nd->nd_next;
        }
    }

    /*
     * The parse loop below stores argument i in t.n_lst[i], so refuse
     * more arguments than there are mediator slots before touching the
     * array.  A request this large could never get past the
     * MDE_TOOMANYMED check further down, so report it the same way.
     */
    if (node_c > max_meds || node_c > MED_MAX_HOSTS)
        return (mderror(ep, MDE_TOOMANYMED, NULL));

    /* Parse the command line into the md_h_arr_t structure */
    for (i = 0; i < t.n_cnt; i++) {
        cp = strtok(node_v[i], ",");
        j = 0;
        while (cp) {
            if (strlen(cp) > (size_t)MD_MAX_NODENAME)
                return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
                    sp->setno, cp, NULL, sp->setname));
            if (j >= MAX_HOST_ADDRS)
                return (mddserror(ep, MDE_DS_TOOMANYALIAS,
                    sp->setno, cp, NULL, sp->setname));

            (void) strcpy(t.n_lst[i].a_nm[j], cp);

            j++;

            cp = strtok(NULL, ",");
        }
        t.n_lst[i].a_cnt = j;
    }

    /* Make a list of nodes to check */
    for (i = 0; i < t.n_cnt; i++)
        for (j = 0; j < t.n_lst[i].a_cnt; j++)
            n_c = add_lst(&n_l, t.n_lst[i].a_nm[j]);

    /* Make sure that there are no redundant nodes */
    rval = nodesuniq(sp, n_c, n_l, ep);

    (void) del_lst(&n_l);

    if (rval != 0)
        return (rval);

    /*
     * Lock the set on current set members.
     * Set locking done much earlier for MN diskset than for traditional
     * diskset since lock_set and SUSPEND are used to protect against
     * other metaset commands running on the other nodes.
     */
    if (MD_MNSET_DESC(sd)) {
        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
            nd = nd->nd_next;
        }
        /*
         * Lock out other meta* commands by suspending
         * class 1 messages across the diskset.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename,
                COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
                MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspend1_flag = 1;
            nd = nd->nd_next;
        }
    }

    if (validate_med_nodes(sp, &t, ep)) {
        rval = -1;
        goto out;
    }

    /* Check existing mediators against new, if any */
    if (sd->sd_med.n_cnt > 0) {
        for (i = 0; i < max_meds; i++)
            if (sd->sd_med.n_lst[i].a_cnt > 0)
                n_c = add_lst(&n_l,
                    sd->sd_med.n_lst[i].a_nm[0]);

        for (i = 0; i < t.n_cnt; i++) {
            if (strinlst(t.n_lst[i].a_nm[0], n_c, n_l)) {
                (void) del_lst(&n_l);
                (void) mddserror(ep, MDE_DS_ISMED, sp->setno,
                    t.n_lst[i].a_nm[0], NULL,
                    sp->setname);
                rval = -1;
                goto out;
            }
        }
        (void) del_lst(&n_l);
    }

    if ((t.n_cnt + sd->sd_med.n_cnt) > max_meds) {
        (void) mderror(ep, MDE_TOOMANYMED, NULL);
        rval = -1;
        goto out;
    }

    /* Copy the current mediator list for rollback */
    rb_t = sd->sd_med;          /* structure assignment */

    /* Setup the mediator record roll-back structure */
    (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
    rb_medr.med_rec_mag = MED_REC_MAGIC;
    rb_medr.med_rec_rev = MED_REC_REV;
    rb_medr.med_rec_fl  = 0;
    rb_medr.med_rec_sn  = sp->setno;
    (void) strcpy(rb_medr.med_rec_snm, sp->setname);
    if (MD_MNSET_DESC(sd)) {
        /*
         * For a MN diskset the mediator is not given a list of
         * hosts in the set.  Instead a generic name (multiowner) is
         * given to the mediator which will allow any node to access
         * the mediator data as long as it provides the correct
         * setname and set number.  In a MN diskset, the mediator
         * data is only used when a first node joins the diskset
         * and becomes the master of the MN diskset.
         *
         * The traditional diskset code keeps the host list in
         * the mediator record up to date with respect to the host
         * list in the traditional diskset.  This keeps an unauthorized
         * node in the traditional diskset from accessing the data
         * in the mediator record and being able to 'take' the
         * diskset.
         *
         * This additional check is needed in the traditional diskset
         * since a panic during the metaset command can leave
         * the diskset with some nodes thinking that an
         * action has occurred and other nodes thinking the opposite.
         * A node may have really been removed from a diskset, but
         * that node doesn't realize this so this node must be
         * blocked from using the mediator data when attempting
         * to 'take' the diskset.
         * (Traditional diskset code has each node's rpc.metad
         * cleaning up from an inconsistent state without any
         * knowledge from the other nodes in the diskset).
         *
         * In the MN diskset, the reconfig steps force a consistent
         * state across all nodes in the diskset, so no node
         * needs to be blocked from accessing the mediator data.
         * This allows the MN diskset to use a common 'nodename'
         * in the mediator record.  This allows the mediator
         * daemon to remain unchanged even though a large number of
         * nodes are supported by the MN diskset.
         */
        (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
            MD_MAX_NODENAME_PLUS_1);
    } else {
        for (i = 0; i < MD_MAXSIDES; i++)
            (void) strcpy(rb_medr.med_rec_nodes[i],
                sd->sd_nodes[i]);
    }
    rb_medr.med_rec_meds = sd->sd_med;  /* structure assignment */
    (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
    rb_medr.med_rec_foff = 0;
    crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);

    /*
     * Merge new mediators into the set record: each new entry takes
     * the first slot whose a_cnt is still zero.
     */
    for (i = 0; i < t.n_cnt; i++) {
        for (j = 0; j < max_meds; j++) {
            if (sd->sd_med.n_lst[j].a_cnt > 0)
                continue;
            sd->sd_med.n_lst[j] = t.n_lst[i];
            SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_MEDIATOR,
                sp->setno, j);
            sd->sd_med.n_cnt++;
            break;
        }
    }

    /*
     * Setup the kernel mediator list, which also validates that the
     * hosts have valid IP addresses
     */
    (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
    mp.med_setno = sp->setno;

    /* Copy the hostnames */
    if (meta_h2hi(&sd->sd_med, &mp.med, ep)) {
        rval = -1;
        goto out;
    }

    /* Resolve the IP addresses for the host list */
    if (meta_med_hnm2ip(&mp.med, ep)) {
        rval = -1;
        goto out;
    }

    /* Bring the mediator record up to date with the set record */
    medr = rb_medr;             /* structure assignment */
    medr.med_rec_meds = sd->sd_med;     /* structure assignment */
    crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

    /* END CHECK CODE */

    /* Lock the set on current set members */
    if (!(MD_MNSET_DESC(sd))) {
        /* all signals already blocked for MN diskset */
        md_rb_sig_handling_on();
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                rval = -1;
                goto out;
            }
            lock_flag = 1;
        }
    }

    RB_TEST(1, "meta_set_addmeds", ep)

    RB_PREEMPT;
    rb_level = 1;   /* level 1 */

    RB_TEST(2, "meta_set_addmeds", ep)

    /*
     * Add the new mediator information to all hosts in the set.
     * For MN diskset, each node sends mediator list to its kernel.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            /* All nodes are guaranteed to be ALIVE */
            if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
                goto rollback;
            nd = nd->nd_next;
        }
    } else  {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
                goto rollback;
        }
    }

    RB_TEST(3, "meta_set_addmeds", ep)

    RB_PREEMPT;
    rb_level = 2;   /* level 2 */

    RB_TEST(4, "meta_set_addmeds", ep)

    /* A NULL drive descriptor is only an error if ep was set */
    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep)) == NULL) {
        if (! mdisok(ep))
            goto rollback;
    }

    RB_TEST(5, "meta_set_addmeds", ep)

    RB_PREEMPT;
    rb_level = 3;   /* level 3 */

    RB_TEST(6, "meta_set_addmeds", ep)

    /* Inform the mediator hosts of the new information */
    for (i = 0; i < max_meds; i++) {
        if (sd->sd_med.n_lst[i].a_cnt == 0)
            continue;

        /* medr contains new mediator node list */
        if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
            goto rollback;
    }

    RB_TEST(7, "meta_set_addmeds", ep)

    RB_PREEMPT;
    rb_level = 4;   /* level 4 */

    RB_TEST(8, "meta_set_addmeds", ep)

    /* In MN diskset, mediator list updated in clnt_updmeds call */
    if (dd != NULL) {
        if (!(MD_MNSET_DESC(sd))) {
            if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde,
                NULL) != 0) {
                (void) mdstealerror(ep, &mp.med_mde);
                goto rollback;
            }
        }

        /*
         * If only 50% mddbs available, mediator will be
         * golden by this ioctl on a traditional diskset.
         *
         * On a MN diskset, this only happens if the mediator
         * add operation is executed on the master node.
         * If a slave node is adding the mediator, the mediator
         * won't be marked golden until the next mddb change.
         */
        (void) memset(&mup, '\0', sizeof (mddb_med_upd_parm_t));
        mup.med_setno = sp->setno;
        if (metaioctl(MD_MED_UPD_MED, &mup, &mup.med_mde, NULL) != 0)
            mdclrerror(&mup.med_mde);
    }

out:
    /*
     * Common exit for success and for failures that happened before
     * anything was pushed out (rb_level is still 0 for those).
     */
    if (suspend1_flag) {
        /*
         * Unlock diskset by resuming messages across the diskset.
         * Just resume all classes so that resume is the same whether
         * just one class was locked or all classes were locked.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                /* keep the first error; only adopt xep if none yet */
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }
    if (lock_flag) {
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            while (nd) {
                /* All nodes are guaranteed to be ALIVE */
                if (clnt_unlock_set(nd->nd_nodename,
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
                nd = nd->nd_next;
            }
        } else  {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_unlock_set(sd->sd_nodes[i],
                    cl_sk, &xep)) {
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
            }
        }
        cl_set_setkey(NULL);
    }

    metafreedrivedesc(&dd);

    if (MD_MNSET_DESC(sd)) {
        /* release signals back to what they were on entry */
        if (procsigs(FALSE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    } else {
        md_rb_sig_handling_off(md_got_sig(), md_which_sig());
    }

    return (rval);

rollback:
    /*
     * Undo the stages recorded in rb_level, highest first.  Errors from
     * the undo calls go to xep and are discarded so that ep keeps the
     * error that triggered the rollback.
     */
    /* all signals already blocked for MN diskset */
    if (!(MD_MNSET_DESC(sd))) {
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    rval = -1;

    /*
     * level 4
     * In MN diskset, mediator list updated in clnt_updmeds call
     */
    if (rb_level > 3 && (dd != NULL) && (!(MD_MNSET_DESC(sd)))) {
        (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
        mp.med_setno = sp->setno;
        (void) meta_h2hi(&rb_t, &mp.med, &xep);
        mdclrerror(&xep);
        (void) meta_med_hnm2ip(&mp.med, &xep);
        mdclrerror(&xep);
        (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
    }

    /* level 3 */
    if (rb_level > 2) {
        for (i = 0; i < max_meds; i++) {
            if (sd->sd_med.n_lst[i].a_cnt == 0)
                continue;

            /*
             * rb_medr contains the rollback mediator node list.
             * Send the rollback mediator information to the
             * new mediator node list.  If a node had this RPC
             * called, but its node is not in the mediator node
             * list, rpc.metamedd will delete the mediator
             * record on that node.
             */
            if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
                &rb_medr, &xep))
                mdclrerror(&xep);
        }
    }

    /* level 2 */
    if (rb_level > 1) {
        metafreedrivedesc(&dd);
    }

    /* level 1 */
    if (rb_level > 0) {
        /* Delete mediator information from all hosts in the set */
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            while (nd) {
                /* All nodes are guaranteed to be ALIVE */
                if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
                    &xep))
                    mdclrerror(&xep);
                nd = nd->nd_next;
            }
        } else  {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
                    &xep))
                    mdclrerror(&xep);
            }
        }
    }

    /* level 0 */
    if (suspend1_flag) {
        /*
         * Unlock diskset by resuming messages across the diskset.
         * Just resume all classes so that resume is the same whether
         * just one class was locked or all classes were locked.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                mdclrerror(&xep);
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }
    if (lock_flag) {
        cl_sk = cl_get_setkey(sp->setno, sp->setname);
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            while (nd) {
                /* All nodes are guaranteed to be ALIVE */
                if (clnt_unlock_set(nd->nd_nodename,
                    cl_sk, &xep)) {
                    mdclrerror(&xep);
                }
                nd = nd->nd_next;
            }
        } else  {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_unlock_set(sd->sd_nodes[i],
                    cl_sk, &xep)) {
                    mdclrerror(&xep);
                }
            }
        }
        cl_set_setkey(NULL);
    }

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    if (!(MD_MNSET_DESC(sd))) {
        md_rb_sig_handling_off(md_got_sig(), md_which_sig());
    }

    return (rval);
}

int
meta_set_deletemeds(
    mdsetname_t     *sp,
    int         node_c,
    char            **node_v,
    int         forceflg,
    md_error_t      *ep
)
{
    md_set_desc     *sd = NULL;
    md_drive_desc       *dd = NULL;
    mddb_med_parm_t     mp;
    md_h_arr_t      rb_t;
    med_rec_t       medr;
    med_rec_t       rb_medr;
    int         i, j;
    char            **n_l = NULL;
    int         n_c = 0;
    sigset_t        oldsigs;
    md_setkey_t     *cl_sk;
    int         rb_level = 0;
    md_error_t      xep = mdnullerror;
    int         rval = 0;
    int         max_meds;
    md_mnnode_desc      *nd;
    int         suspend1_flag = 0;
    int         lock_flag = 0;

    mdclrerror(ep);

    if ((sd = metaget_setdesc(sp, ep)) == NULL)
        return (-1);

    /* Make sure we own the set */
    if (meta_check_ownership(sp, ep) != 0)
        return (-1);

    for (i = 0; i < node_c; i++)
        if (strchr(node_v[i], ',') != NULL)
            return (mderror(ep, MDE_ONLYNODENAME, node_v[i]));

    if (nodesuniq(sp, node_c, node_v, ep))
        return (-1);

    if ((max_meds = get_max_meds(ep)) == 0)
        return (-1);

    /*
     * The mediator information (which is part of the set record) is
     * stored in the local mddbs of each node in the diskset.
     * Each node's rpc.metad daemon reads in the set
     * records from that node's local mddb and caches them
     * internally. Any process needing diskset information contacts its
     * local rpc.metad to get this information.  Since each node in the
     * diskset is independently reading the set information from its local
     * mddb, the set records in the local mddbs must stay
     * in-sync, so that all nodes have a consistent view of the diskset.
     *
     * For a multinode diskset, explicitly verify that all nodes in the
     * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
     * fail this operation since all nodes must be ALIVE in order to delete
     * the mediator information from the set record in their local mddb.
     * If a panic of this node leaves the local mddbs set records
     * out-of-sync, the reconfig cycle will fix the local mddbs and
     * force them back into synchronization.
     */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
                (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
                    sp->setno,
                    nd->nd_nodename, NULL, sp->setname);
                return (-1);
            }
            nd = nd->nd_next;
        }
    }

    if (sd->sd_med.n_cnt == 0)
        return (mderror(ep, MDE_NOMED, NULL));

    /* Make a list of nodes to check */
    for (i = 0; i < max_meds; i++)
        if (sd->sd_med.n_lst[i].a_cnt > 0)
            n_c = add_lst(&n_l, sd->sd_med.n_lst[i].a_nm[0]);

    for (i = 0; i < node_c; i++) {
        if (! strinlst(node_v[i], n_c, n_l)) {
            (void) del_lst(&n_l);
            return (mddserror(ep, MDE_DS_ISNOTMED, sp->setno,
                node_v[i], NULL, sp->setname));
        }
    }

    (void) del_lst(&n_l);

    /* Save a copy of the current mediator information */
    rb_t = sd->sd_med;          /* structure assignment */

    /* Setup the mediator record for rollback */
    (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
    rb_medr.med_rec_mag = MED_REC_MAGIC;
    rb_medr.med_rec_rev = MED_REC_REV;
    rb_medr.med_rec_fl  = 0;
    rb_medr.med_rec_sn  = sp->setno;
    (void) strcpy(rb_medr.med_rec_snm, sp->setname);
    if (MD_MNSET_DESC(sd)) {
        /*
         * In MN diskset, use a generic nodename, multiowner, in the
         * mediator record which allows any node to access mediator
         * information.  MN diskset reconfig cycle forces consistent
         * view of set/node/drive/mediator information across all nodes
         * in the MN diskset.  This allows the relaxation of
         * node name checking in rpc.metamedd for MN disksets.
         *
         * In the traditional diskset, only a node that is in the
         * mediator record's diskset nodelist can access mediator
         * data.
         */
        (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
            MD_MAX_NODENAME_PLUS_1);
    } else {
        for (i = 0; i < MD_MAXSIDES; i++)
            (void) strcpy(rb_medr.med_rec_nodes[i],
                sd->sd_nodes[i]);
    }
    rb_medr.med_rec_meds = sd->sd_med;  /* structure assignment */
    (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
    rb_medr.med_rec_foff = 0;
    crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);

    /* Delete the mediators requested from the set */
    for (i = 0; i < node_c; i++) {
        for (j = 0; j < max_meds; j++) {
            if (sd->sd_med.n_lst[j].a_cnt == 0)
                continue;
            if (strcmp(node_v[i],
                sd->sd_med.n_lst[j].a_nm[0]) != 0)
                continue;
            SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
                SVM_TAG_MEDIATOR, sp->setno, j);
            (void) memset(&sd->sd_med.n_lst[j], '\0',
                sizeof (md_h_t));
            sd->sd_med.n_cnt--;
            break;
        }
    }

    medr = rb_medr;             /* structure assignment */
    medr.med_rec_meds = sd->sd_med;     /* structure assignment */
    crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

    /* END CHECK CODE */

    /* Lock the set on current set members */
    if (MD_MNSET_DESC(sd)) {
        /* Make sure we are blocking all signals */
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
        /*
         * Lock the set on current set members.
         * lock_set and SUSPEND are used to protect against
         * other metaset commands running on the other nodes.
         */
        nd = sd->sd_nodelist;
        while (nd) {
            /* All nodes are guaranteed to be ALIVE */
            if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
                if (forceflg && strcmp(mynode(),
                    nd->nd_nodename) != 0) {
                    mdclrerror(ep);
                    nd = nd->nd_next;
                    continue;
                }
                rval = -1;
                goto out;
            }
            lock_flag = 1;
            nd = nd->nd_next;
        }
        /*
         * Lock out other meta* commands by suspending
         * class 1 messages across the diskset.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename,
                COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
                MD_MSCF_NO_FLAGS, ep)) {
                rval = -1;
                goto out;
            }
            suspend1_flag = 1;
            nd = nd->nd_next;
        }
    } else  {
        md_rb_sig_handling_on();
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
                if (forceflg &&
                    strcmp(mynode(), sd->sd_nodes[i]) != 0) {
                    mdclrerror(ep);
                    continue;
                }
                rval = -1;
                goto out;
            }
            lock_flag = 1;
        }
    }

    RB_TEST(1, "meta_set_deletemeds", ep)

    RB_PREEMPT;
    rb_level = 1;   /* level 1 */

    RB_TEST(2, "meta_set_deletemeds", ep)

    /* Update the mediator information on all hosts in the set */
    if (MD_MNSET_DESC(sd)) {
        nd = sd->sd_nodelist;
        while (nd) {
            /* All nodes are guaranteed to be ALIVE */
            if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med,
                ep)) {
                if (forceflg && strcmp(mynode(),
                    nd->nd_nodename) != 0) {
                    mdclrerror(ep);
                    continue;
                }
                goto rollback;
            }
            nd = nd->nd_next;
        }
    } else  {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med,
                ep)) {
                if (forceflg && strcmp(mynode(),
                    sd->sd_nodes[i]) != 0) {
                    mdclrerror(ep);
                    continue;
                }
                goto rollback;
            }
        }
    }

    RB_TEST(3, "meta_set_deletemeds", ep)

    RB_PREEMPT;
    rb_level = 2;   /* level 2 */

    RB_TEST(5, "meta_set_deletemeds", ep)

    if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
        ep)) == NULL) {
        if (! mdisok(ep))
            goto rollback;
    }

    RB_TEST(5, "meta_set_deletemeds", ep)

    RB_PREEMPT;
    rb_level = 3;   /* level 3 */

    RB_TEST(6, "meta_set_deletemeds", ep)

    if (dd != NULL) {
        /*
         * Set up the parameters to the call to update the
         * kernel mediator list
         */
        (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
        mp.med_setno = sp->setno;
        if (meta_h2hi(&sd->sd_med, &mp.med, ep))
            goto rollback;

        /* Resolve the IP addresses for the host list */
        if (meta_med_hnm2ip(&mp.med, ep))
            goto rollback;

        if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL) != 0) {
            (void) mdstealerror(ep, &mp.med_mde);
            goto rollback;
        }
    }

    RB_TEST(7, "meta_set_deletemeds", ep)

    RB_PREEMPT;
    rb_level = 4;   /* level 4 */

    RB_TEST(8, "meta_set_deletemeds", ep)

    /* Inform the mediator hosts of the new status */
    for (i = 0; i < max_meds; i++) {
        if (rb_t.n_lst[i].a_cnt == 0)
            continue;

        /*
         * medr contains the new mediator node list.
         * Send the new mediator information to the
         * new mediator node list.  If a node had this RPC
         * called, but its node is no longer in the new mediator
         * node list, rpc.metamedd will delete the mediator
         * record on that node.
         */
        if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &medr, ep)) {
            if ((forceflg && mdanyrpcerror(ep)) ||
                mdisrpcerror(ep, RPC_PROGNOTREGISTERED)) {
                mdclrerror(ep);
                continue;
            }
            goto rollback;
        }
    }

out:
    if (dd)
        metafreedrivedesc(&dd);

    if (suspend1_flag) {
        /*
         * Unlock diskset by resuming messages across the diskset.
         * Just resume all classes so that resume is the same whether
         * just one class was locked or all classes were locked.
         */
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
                sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
                if (rval == 0)
                    (void) mdstealerror(ep, &xep);
                rval = -1;
                mde_perror(ep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
            }
            nd = nd->nd_next;
        }
        meta_ping_mnset(sp->setno);
    }

    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    if (lock_flag) {
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            while (nd) {
                /* All nodes are guaranteed to be ALIVE */
                if (clnt_unlock_set(nd->nd_nodename,
                    cl_sk, &xep)) {
                    if (forceflg &&
                        strcmp(mynode(),
                        nd->nd_nodename) != 0) {
                        mdclrerror(ep);
                        continue;
                    }
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
                nd = nd->nd_next;
            }
        } else {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_unlock_set(sd->sd_nodes[i],
                    cl_sk, &xep)) {
                    if (forceflg &&
                        strcmp(mynode(),
                        sd->sd_nodes[i]) != 0) {
                        mdclrerror(ep);
                        continue;
                    }
                    if (rval == 0)
                        (void) mdstealerror(ep, &xep);
                    rval = -1;
                }
            }
        }
    }
    cl_set_setkey(NULL);

    if (MD_MNSET_DESC(sd)) {
        /* release signals back to what they were on entry */
        if (procsigs(FALSE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    } else {
        md_rb_sig_handling_off(md_got_sig(), md_which_sig());
    }

    return (rval);

rollback:
    /* all signals are already blocked for a MN diskset */
    if (!(MD_MNSET_DESC(sd))) {
        if (procsigs(TRUE, &oldsigs, &xep) < 0)
            mdclrerror(&xep);
    }

    rval = -1;

    (void) del_lst(&n_l);

    /* level 4 */
    if (rb_level > 4) {
        for (i = 0; i < max_meds; i++) {
            if (rb_t.n_lst[i].a_cnt == 0)
                continue;

            /*
             * rb_medr contains the rollback mediator node list.
             * Send the rollback mediator information to the
             * new mediator node list.  This will recreate the
             * mediator record on all nodes where the mediator
             * record had been removed.
             */
            if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &rb_medr,
                &xep))
                mdclrerror(&xep);
        }
    }

    /* level 3 */
    if (rb_level > 2 && dd != NULL) {
        (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
        mp.med_setno = sp->setno;
        (void) meta_h2hi(&rb_t, &mp.med, &xep);
        mdclrerror(&xep);
        (void) meta_med_hnm2ip(&mp.med, &xep);
        mdclrerror(&xep);
        (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
    }

    /* level 2 */
    if (rb_level > 1) {
        metafreedrivedesc(&dd);
    }

    /* level 1 */
    if (rb_level > 0) {
        /* Push the rollback mediator list (rb_t) to all hosts in the set */
        if (MD_MNSET_DESC(sd)) {
            nd = sd->sd_nodelist;
            while (nd) {
                /* All nodes are guaranteed to be ALIVE */
                if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
                    &xep))
                    mdclrerror(&xep);
                nd = nd->nd_next;
            }
        } else  {
            for (i = 0; i < MD_MAXSIDES; i++) {
                /* Skip empty slots */
                if (sd->sd_nodes[i][0] == '\0')
                    continue;

                if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
                    &xep))
                    mdclrerror(&xep);
            }
        }
    }

    /* level 0 */
    cl_sk = cl_get_setkey(sp->setno, sp->setname);
    /* Unlock the set */
    /* Don't test lock flag since guaranteed to be set if in rollback */
    if (MD_MNSET_DESC(sd)) {
        /*
         * Unlock diskset by resuming messages across the diskset.
         * Just resume all classes so that resume is the same whether
         * just one class was locked or all classes were locked.
         */
        if (suspend1_flag) {
            /* All nodes are guaranteed to be ALIVE */
            nd = sd->sd_nodelist;
            while (nd) {
                if (clnt_mdcommdctl(nd->nd_nodename,
                    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
                    MD_MSCF_NO_FLAGS, &xep)) {
                    mde_perror(&xep, dgettext(TEXT_DOMAIN,
                    "Unable to resume rpc.mdcommd.\n"));
                    mdclrerror(&xep);
                }
                nd = nd->nd_next;
            }
            meta_ping_mnset(sp->setno);
        }
        nd = sd->sd_nodelist;
        /* All nodes are guaranteed to be ALIVE */
        while (nd) {
            if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
                mdclrerror(&xep);
            nd = nd->nd_next;
        }
    } else  {
        for (i = 0; i < MD_MAXSIDES; i++) {
            /* Skip empty slots */
            if (sd->sd_nodes[i][0] == '\0')
                continue;

            if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
                mdclrerror(&xep);
        }
    }
    cl_set_setkey(NULL);

    /* release signals back to what they were on entry */
    if (procsigs(FALSE, &oldsigs, &xep) < 0)
        mdclrerror(&xep);

    if (!(MD_MNSET_DESC(sd))) {
        md_rb_sig_handling_off(md_got_sig(), md_which_sig());
    }

    return (rval);
}