2N/A/*
2N/A * CDDL HEADER START
2N/A *
2N/A * The contents of this file are subject to the terms of the
2N/A * Common Development and Distribution License (the "License").
2N/A * You may not use this file except in compliance with the License.
2N/A *
2N/A * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A * or http://www.opensolaris.org/os/licensing.
2N/A * See the License for the specific language governing permissions
2N/A * and limitations under the License.
2N/A *
2N/A * When distributing Covered Code, include this CDDL HEADER in each
2N/A * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A * If applicable, add the following below this CDDL HEADER, with the
2N/A * fields enclosed by brackets "[]" replaced with your own identifying
2N/A * information: Portions Copyright [yyyy] [name of copyright owner]
2N/A *
2N/A * CDDL HEADER END
2N/A */
2N/A/*
2N/A * Copyright (c) 1994, 2012, Oracle and/or its affiliates. All rights reserved.
2N/A */
2N/A
2N/A/*
2N/A * Just in case we're not in a build environment, make sure that
2N/A * TEXT_DOMAIN gets set to something.
2N/A */
2N/A#if !defined(TEXT_DOMAIN)
2N/A#define TEXT_DOMAIN "SYS_TEST"
2N/A#endif
2N/A
2N/A/*
2N/A * Metadevice diskset interfaces
2N/A */
2N/A
2N/A#include "meta_set_prv.h"
2N/A#include <meta.h>
2N/A#include <metad.h>
2N/A#include <mdmn_changelog.h>
2N/A#include <sys/lvm/md_crc.h>
2N/A#include <sys/utsname.h>
2N/A#include <sdssc.h>
2N/A#include <sys/cladm.h>
2N/A
2N/A#include <sys/sysevent/eventdefs.h>
2N/A#include <sys/sysevent/svm.h>
2N/Aextern char *blkname(char *);
2N/Aextern int meta_devid_supported_in_did(void);
2N/A
2N/Astatic md_drive_desc *
2N/Adr2drivedesc(
2N/A mdsetname_t *sp,
2N/A side_t sideno,
2N/A int flags,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_record *sr;
2N/A md_drive_record *dr;
2N/A mddrivename_t *dnp;
2N/A md_drive_desc *dd_head = NULL;
2N/A md_set_desc *sd;
2N/A
2N/A if (flags & MD_BYPASS_DAEMON) {
2N/A if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
2N/A return (NULL);
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (NULL);
2N/A sideno = getnodeside(mynode(), sd);
2N/A sp = metafakesetname(sp->setno, sr->sr_setname);
2N/A } else {
2N/A if ((sr = getsetbyname(sp->setname, ep)) == NULL)
2N/A return (NULL);
2N/A }
2N/A
2N/A assert(sideno != MD_SIDEWILD);
2N/A
2N/A /*
2N/A * WARNING:
2N/A * The act of getting the dnp from the namespace means that we
2N/A * will get the devid of the disk as recorded in the namespace.
2N/A * This devid has the potential to be stale if the disk is being
2N/A * replaced via a rebind, this means that any code that relies
2N/A * on any of the dnp information should take the appropriate action
2N/A * to preserve that information. For example in the rebind code the
2N/A * devid of the new disk is saved off and then copied back in once
2N/A * the code that has called this function has completed.
2N/A */
2N/A for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
2N/A if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
2N/A flags, ep)) == NULL) {
2N/A if (!(flags & MD_BYPASS_DAEMON))
2N/A free_sr(sr);
2N/A metafreedrivedesc(&dd_head);
2N/A return (NULL);
2N/A }
2N/A
2N/A (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
2N/A dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
2N/A }
2N/A
2N/A if (!(flags & MD_BYPASS_DAEMON)) {
2N/A free_sr(sr);
2N/A }
2N/A return (dd_head);
2N/A}
2N/A
2N/Astatic int
2N/Aget_sidenmlist(
2N/A mdsetname_t *sp,
2N/A mddrivename_t *dnp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A mdsidenames_t *sn, **sn_next;
2N/A int i;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A metaflushsidenames(dnp);
2N/A sn_next = &dnp->side_names;
2N/A if (MD_MNSET_DESC(sd)) {
2N/A /*
2N/A * Only get sidenames for this node since
2N/A * that is the only side information stored in
2N/A * the local mddb for a multi-node diskset.
2N/A */
2N/A if (sd->sd_mn_mynode) {
2N/A sn = Zalloc(sizeof (*sn));
2N/A sn->sideno = sd->sd_mn_mynode->nd_nodeid;
2N/A if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
2N/A sn->sideno, dnp->side_names_key, &sn->dname,
2N/A &sn->mnum, NULL, ep)) == NULL) {
2N/A if (sn->dname != NULL)
2N/A Free(sn->dname);
2N/A Free(sn);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Add to the end of the linked list */
2N/A assert(*sn_next == NULL);
2N/A *sn_next = sn;
2N/A sn_next = &sn->next;
2N/A }
2N/A } else {
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A sn = Zalloc(sizeof (*sn));
2N/A sn->sideno = i;
2N/A if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
2N/A i+SKEW, dnp->side_names_key, &sn->dname,
2N/A &sn->mnum, NULL, ep)) == NULL) {
2N/A /*
2N/A * It is possible that during the add of a
2N/A * host to have a 'missing' side as the side
2N/A * for this disk will be added later. So ignore
2N/A * the error. The 'missing' side will be added
2N/A * once the addhosts process has completed.
2N/A */
2N/A if (mdissyserror(ep, ENOENT)) {
2N/A mdclrerror(ep);
2N/A Free(sn);
2N/A continue;
2N/A }
2N/A
2N/A if (sn->dname != NULL)
2N/A Free(sn->dname);
2N/A Free(sn);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Add to the end of the linked list */
2N/A assert(*sn_next == NULL);
2N/A *sn_next = sn;
2N/A sn_next = &sn->next;
2N/A }
2N/A }
2N/A
2N/A return (0);
2N/A}
2N/A
2N/Astatic md_drive_desc *
2N/Arl_to_dd(
2N/A mdsetname_t *sp,
2N/A md_replicalist_t *rlp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_replicalist_t *rl;
2N/A md_replica_t *r;
2N/A md_drive_desc *dd = NULL;
2N/A md_drive_desc *d;
2N/A int found;
2N/A md_set_desc *sd;
2N/A daddr_t nblks = 0;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (NULL);
2N/A
2N/A /* find the smallest existing replica */
2N/A for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2N/A r = rl->rl_repp;
2N/A nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2N/A }
2N/A
2N/A if (nblks <= 0)
2N/A nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
2N/A
2N/A for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2N/A r = rl->rl_repp;
2N/A
2N/A found = 0;
2N/A for (d = dd; d != NULL; d = d->dd_next) {
2N/A if (strcmp(r->r_namep->drivenamep->cname,
2N/A d->dd_dnp->cname) == 0) {
2N/A found = 1;
2N/A dd->dd_dbcnt++;
2N/A break;
2N/A }
2N/A }
2N/A
2N/A if (! found)
2N/A (void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
2N/A 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
2N/A }
2N/A
2N/A return (dd);
2N/A}
2N/A
2N/A/*
2N/A * Exported Entry Points
2N/A */
2N/A
2N/Aset_t
2N/Aget_max_sets(md_error_t *ep)
2N/A{
2N/A
2N/A static set_t max_sets = 0;
2N/A
2N/A if (max_sets == 0)
2N/A if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
2N/A return (0);
2N/A
2N/A return (max_sets);
2N/A}
2N/A
2N/Aint
2N/Aget_max_meds(md_error_t *ep)
2N/A{
2N/A static int max_meds = 0;
2N/A
2N/A if (max_meds == 0)
2N/A if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
2N/A return (0);
2N/A
2N/A return (max_meds);
2N/A}
2N/A
2N/Aside_t
2N/Agetmyside(mdsetname_t *sp, md_error_t *ep)
2N/A{
2N/A md_set_desc *sd;
2N/A char *node = NULL;
2N/A side_t sideno;
2N/A md_set_record *sr;
2N/A
2N/A if (sp->setno == 0)
2N/A return (0);
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (MD_SIDEWILD);
2N/A
2N/A /*
2N/A * We may not have a nodename yet if early in boot and using
2N/A * auto take sets, so find side based on name in set.
2N/A */
2N/A node = mynode();
2N/A
2N/A if ((strcmp(node, "") == 0) && (sd->sd_flags & MD_SR_AUTO_TAKE)) {
2N/A int j;
2N/A
2N/A if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
2N/A return (MD_SIDEWILD);
2N/A /*
2N/A * We know there can only be one host in an auto take set
2N/A * but there are cases where it is not side 0, so check.
2N/A */
2N/A for (j = 0; j < MD_MAXSIDES; j++) {
2N/A /* Skip empty slots */
2N/A if (sr->sr_nodes[j][0] == '\0')
2N/A continue;
2N/A
2N/A node = sr->sr_nodes[j];
2N/A }
2N/A }
2N/A
2N/A
2N/A sideno = getnodeside(node, sd);
2N/A
2N/A if (sideno != MD_SIDEWILD)
2N/A return (sideno);
2N/A
2N/A return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
2N/A}
2N/A
2N/A/*
2N/A * get set info from name
2N/A */
2N/Amd_set_record *
2N/Agetsetbyname(char *setname, md_error_t *ep)
2N/A{
2N/A md_set_record *sr = NULL;
2N/A md_mnset_record *mnsr = NULL;
2N/A char *p;
2N/A size_t len;
2N/A
2N/A /* get set info from daemon */
2N/A if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
2N/A return (NULL);
2N/A if (sr != NULL) {
2N/A /*
2N/A * Returned record could be for a multi-node set or a
2N/A * non-multi-node set.
2N/A */
2N/A if (MD_MNSET_REC(sr)) {
2N/A /*
2N/A * Record is for a multi-node set. Reissue call
2N/A * to get mnset information. Need to free
2N/A * record as if a non-multi-node set record since
2N/A * that is what clnt_getset gave us. If in
2N/A * the daemon, don't free since this is a pointer
2N/A * into the setrecords array.
2N/A */
2N/A if (! md_in_daemon) {
2N/A sr->sr_flags &= ~MD_SR_MN;
2N/A free_sr(sr);
2N/A }
2N/A if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
2N/A ep) == -1)
2N/A return (NULL);
2N/A if (mnsr != NULL)
2N/A return ((struct md_set_record *)mnsr);
2N/A } else {
2N/A return (sr);
2N/A }
2N/A }
2N/A
2N/A /* no such set */
2N/A len = strlen(setname) + 30;
2N/A p = Malloc(len);
2N/A (void) snprintf(p, len, "setname \"%s\"", setname);
2N/A (void) mderror(ep, MDE_NO_SET, p);
2N/A Free(p);
2N/A return (NULL);
2N/A}
2N/A
2N/A/*
2N/A * get set info from number
2N/A */
2N/Amd_set_record *
2N/Agetsetbynum(set_t setno, md_error_t *ep)
2N/A{
2N/A md_set_record *sr;
2N/A md_mnset_record *mnsr = NULL;
2N/A char buf[100];
2N/A
2N/A if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
2N/A return (NULL);
2N/A
2N/A if (sr != NULL) {
2N/A /*
2N/A * Record is for a multi-node set. Reissue call
2N/A * to get mnset information. Need to free
2N/A * record as if a non-multi-node set record since
2N/A * that is what clnt_getset gave us. If in
2N/A * the daemon, don't free since this is a pointer
2N/A * into the setrecords array.
2N/A */
2N/A if (MD_MNSET_REC(sr)) {
2N/A /*
2N/A * Record is for a multi-node set. Reissue call
2N/A * to get mnset information.
2N/A */
2N/A if (! md_in_daemon) {
2N/A sr->sr_flags &= ~MD_SR_MN;
2N/A free_sr(sr);
2N/A }
2N/A if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
2N/A ep) == -1)
2N/A return (NULL);
2N/A if (mnsr != NULL)
2N/A return ((struct md_set_record *)mnsr);
2N/A } else {
2N/A return (sr);
2N/A }
2N/A }
2N/A
2N/A (void) sprintf(buf, "setno %u", setno);
2N/A (void) mderror(ep, MDE_NO_SET, buf);
2N/A return (NULL);
2N/A}
2N/A
2N/Aint
2N/Ameta_check_drive_inuse(
2N/A mdsetname_t *sp,
2N/A mddrivename_t *dnp,
2N/A int check_db,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A mdnamelist_t *nlp = NULL;
2N/A mdnamelist_t *p;
2N/A int rval = 0;
2N/A
2N/A /* get all underlying partitions */
2N/A if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
2N/A return (-1);
2N/A
2N/A /* search for drive */
2N/A for (p = nlp; (p != NULL); p = p->next) {
2N/A mdname_t *np = p->namep;
2N/A
2N/A if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
2N/A rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
2N/A NULL, dnp->cname, sp->setname));
2N/A break;
2N/A }
2N/A }
2N/A
2N/A /* cleanup, return success */
2N/A metafreenamelist(nlp);
2N/A return (rval);
2N/A}
2N/A
2N/A/*
2N/A * simple check for ownership
2N/A */
2N/Aint
2N/Ameta_check_ownership(mdsetname_t *sp, md_error_t *ep)
2N/A{
2N/A int ownset;
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd;
2N/A md_replicalist_t *rlp = NULL;
2N/A md_error_t xep = mdnullerror;
2N/A
2N/A if (metaislocalset(sp))
2N/A return (0);
2N/A
2N/A ownset = own_set(sp, NULL, TRUE, ep);
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A
2N/A /* If we have no drive descriptors, check for no ownership */
2N/A if (dd == NULL) {
2N/A if (ownset == MD_SETOWNER_NONE)
2N/A return (0);
2N/A
2N/A /* If ownership somehow has come to exist, we must clean up */
2N/A
2N/A if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
2N/A &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
2N/A if (! mdisok(&xep))
2N/A mdclrerror(&xep);
2N/A
2N/A if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
2N/A if (rel_own_bydd(sp, dd, TRUE, &xep))
2N/A mdclrerror(&xep);
2N/A }
2N/A
2N/A if (halt_set(sp, &xep))
2N/A mdclrerror(&xep);
2N/A
2N/A metafreereplicalist(rlp);
2N/A
2N/A metafreedrivedesc(&dd);
2N/A
2N/A return (0);
2N/A }
2N/A
2N/A metafreedrivedesc(&sd->sd_drvs);
2N/A
2N/A if (ownset == MD_SETOWNER_YES)
2N/A return (0);
2N/A
2N/A return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
2N/A sp->setname));
2N/A}
2N/A
2N/A/*
2N/A * simple check for ownership
2N/A */
2N/Aint
2N/Ameta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
2N/A{
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd;
2N/A int bool;
2N/A
2N/A if (metaislocalset(sp))
2N/A return (0);
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A if (getnodeside(hostname, sd) == MD_SIDEWILD)
2N/A return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2N/A hostname, NULL, sp->setname));
2N/A
2N/A dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A
2N/A if (clnt_ownset(hostname, sp, &bool, ep) == -1)
2N/A return (-1);
2N/A
2N/A if (dd == NULL)
2N/A return (0);
2N/A
2N/A metafreedrivedesc(&sd->sd_drvs);
2N/A
2N/A if (bool == TRUE)
2N/A return (0);
2N/A
2N/A return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
2N/A sp->setname));
2N/A}
2N/A
2N/A/*
2N/A * Function that determines if a node is in the multinode diskset
2N/A * membership list. Calling node passes in node to be checked and
2N/A * the nodelist as returned from meta_read_nodelist. This routine
2N/A * anticipates being called many times using the same diskset membership
2N/A * list which is why the alloc and free of the diskset membership list
2N/A * is left to the calling routine.
2N/A * Returns:
2N/A * 1 - if a member
2N/A * 0 - not a member
2N/A */
2N/Aint
2N/Ameta_is_member(
2N/A char *node_name,
2N/A md_mn_nodeid_t node_id,
2N/A mndiskset_membershiplist_t *nl
2N/A)
2N/A{
2N/A mndiskset_membershiplist_t *nl2;
2N/A int flag_check_name;
2N/A
2N/A if (node_id != 0)
2N/A flag_check_name = 0;
2N/A else if (node_name != NULL)
2N/A flag_check_name = 1;
2N/A else
2N/A return (0);
2N/A
2N/A nl2 = nl;
2N/A while (nl2) {
2N/A if (flag_check_name) {
2N/A /* Compare given name against name in member list */
2N/A if (strcmp(nl2->msl_node_name, node_name) == 0)
2N/A break;
2N/A } else {
2N/A /* Compare given nodeid against nodeid in member list */
2N/A if (nl2->msl_node_id == node_id)
2N/A break;
2N/A }
2N/A nl2 = nl2->next;
2N/A }
2N/A /* No match found in member list */
2N/A if (nl2 == NULL) {
2N/A return (0);
2N/A }
2N/A /* Return 1 if node is in member list */
2N/A return (1);
2N/A}
2N/A
2N/A/*
2N/A * meta_getnext_devinfo should go to the host that
2N/A * has the device, to return the device name, driver name, minor num.
2N/A * We can take the big cheat for now, since it is a requirement
2N/A * that the device names and device numbers are the same, and
2N/A * just get the info locally.
2N/A *
2N/A * This routine is very similar to meta_getnextside_devinfo except
2N/A * that the specific side to be used is being passed in.
2N/A *
2N/A * Exit status:
2N/A * 0 - No more side info to return
2N/A * 1 - More side info's to return
2N/A * -1 - An error has been detected
2N/A */
2N/A/*ARGSUSED*/
2N/Aint
2N/Ameta_getside_devinfo(
2N/A mdsetname_t *sp, /* for this set */
2N/A char *bname, /* local block name (myside) */
2N/A side_t sideno, /* sideno */
2N/A char **ret_bname, /* block device name of returned side */
2N/A char **ret_dname, /* driver name of returned side */
2N/A minor_t *ret_mnum, /* minor number of returned side */
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A mdname_t *np;
2N/A
2N/A if (ret_bname != NULL)
2N/A *ret_bname = NULL;
2N/A if (ret_dname != NULL)
2N/A *ret_dname = NULL;
2N/A if (ret_mnum != NULL)
2N/A *ret_mnum = NODEV32;
2N/A
2N/A
2N/A if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A/*
2N/A * NOTE (future) - There will be more work here once devids are integrated
2N/A * into disksets. Then the side should be used to find the correct
2N/A * host and the b/d names should be gotten from that host.
2N/A */
2N/A
2N/A /*
2N/A * Return the side info.
2N/A */
2N/A if (ret_bname != NULL)
2N/A *ret_bname = Strdup(np->bname);
2N/A
2N/A if (ret_dname != NULL) {
2N/A mdcinfo_t *cinfo;
2N/A
2N/A if ((cinfo = metagetcinfo(np, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A *ret_dname = Strdup(cinfo->dname);
2N/A }
2N/A
2N/A if (ret_mnum != NULL)
2N/A *ret_mnum = meta_getminor(np->dev);
2N/A
2N/A return (1);
2N/A}
2N/A
2N/A/*
2N/A * Get the information on the device from the remote node using the devid
2N/A * of the disk.
2N/A *
2N/A * Exit status:
2N/A * 0 - No more side info to return
2N/A * 1 - More side info's to return
2N/A * -1 - An error has been detected
2N/A */
2N/Aint
2N/Ameta_getnextside_devinfo(
2N/A mdsetname_t *sp, /* for this set */
2N/A char *bname, /* local block name (myside) */
2N/A side_t *sideno, /* previous sideno & returned sideno */
2N/A char **ret_bname, /* block device name of returned side */
2N/A char **ret_dname, /* driver name of returned side */
2N/A minor_t *ret_mnum, /* minor number of returned side */
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A int i;
2N/A mdname_t *np;
2N/A mddrivename_t *dnp;
2N/A char *devidstr = NULL;
2N/A int devidstrlen;
2N/A md_dev64_t retdev = NODEV64;
2N/A char *ret_devname = NULL;
2N/A char *ret_blkdevname = NULL;
2N/A char *ret_driver = NULL;
2N/A char *nodename;
2N/A int fd;
2N/A int ret = -1;
2N/A char *minor_name = NULL;
2N/A md_mnnode_desc *nd;
2N/A
2N/A
2N/A if (ret_bname != NULL)
2N/A *ret_bname = NULL;
2N/A if (ret_dname != NULL)
2N/A *ret_dname = NULL;
2N/A if (ret_mnum != NULL)
2N/A *ret_mnum = NODEV32;
2N/A
2N/A if (metaislocalset(sp)) {
2N/A /* no more sides - we are done */
2N/A if (*sideno != MD_SIDEWILD)
2N/A return (0);
2N/A
2N/A /* First time through - set up return sideno */
2N/A *sideno = 0;
2N/A } else {
2N/A
2N/A /*
2N/A * Find the next sideno, starting after the one given.
2N/A */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A if ((*sideno == MD_SIDEWILD) &&
2N/A (nd != (struct md_mnnode_desc *)NULL)) {
2N/A *sideno = nd->nd_nodeid;
2N/A } else {
2N/A while (nd) {
2N/A /*
2N/A * Found given sideno, now find
2N/A * next sideno, if there is one.
2N/A */
2N/A if ((*sideno == nd->nd_nodeid) &&
2N/A (nd->nd_next !=
2N/A (struct md_mnnode_desc *)NULL)) {
2N/A *sideno =
2N/A nd->nd_next->nd_nodeid;
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A if (nd == NULL) {
2N/A return (0);
2N/A }
2N/A }
2N/A
2N/A if (*sideno == MD_SIDEWILD)
2N/A return (0);
2N/A
2N/A nodename = (char *)nd->nd_nodename;
2N/A } else {
2N/A for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
2N/A /* Find next full slot */
2N/A if (sd->sd_nodes[i][0] != '\0')
2N/A break;
2N/A
2N/A /* No more sides - we are done */
2N/A if (i == MD_MAXSIDES)
2N/A return (0);
2N/A
2N/A /* Set up the return sideno */
2N/A *sideno = i;
2N/A nodename = (char *)sd->sd_nodes[i];
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Need to pass the node the devid of the disk and get it to
2N/A * send back the details of the disk from that side.
2N/A */
2N/A if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A dnp = np->drivenamep;
2N/A
2N/A /*
2N/A * By default, set up the parameters so that they are copied out.
2N/A */
2N/A if (ret_bname != NULL)
2N/A *ret_bname = Strdup(np->bname);
2N/A
2N/A if (ret_dname != NULL) {
2N/A mdcinfo_t *cinfo;
2N/A
2N/A if ((cinfo = metagetcinfo(np, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A *ret_dname = Strdup(cinfo->dname);
2N/A }
2N/A
2N/A if (ret_mnum != NULL)
2N/A *ret_mnum = meta_getminor(np->dev);
2N/A
2N/A /*
2N/A * Try some optimization. If this is the local set or the device
2N/A * is a metadevice then just copy the information. If the device
2N/A * does not have a devid (due to not having a minor name) then
2N/A * fall back to the pre-devid behaviour of copying the information
2N/A * on the device: this is okay because the sanity checks before this
2N/A * call would have found any issues with the device.
2N/A */
2N/A if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL))
2N/A return (1);
2N/A
2N/A if (np->minor_name == (char *)NULL) {
2N/A /*
2N/A * Have to get the minor name then. The slice should exist
2N/A * on the disk because it will have already been repartitioned
2N/A * up prior to getting to this point.
2N/A */
2N/A if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
2N/A (void) mdsyserror(ep, errno, np->bname);
2N/A return (-1);
2N/A }
2N/A (void) devid_get_minor_name(fd, &minor_name);
2N/A np->minor_name = Strdup(minor_name);
2N/A devid_str_free(minor_name);
2N/A (void) close(fd);
2N/A }
2N/A
2N/A /* allocate extra space for "/" and NULL hence +2 */
2N/A devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
2N/A devidstr = (char *)Malloc(devidstrlen);
2N/A
2N/A /*
2N/A * As a minor name is supplied then the ret_devname will be
2N/A * appropriate to that minor_name and in this case it will be
2N/A * a block device ie /dev/dsk.
2N/A */
2N/A (void) snprintf(devidstr, devidstrlen,
2N/A "%s/%s", dnp->devid, np->minor_name);
2N/A
2N/A ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
2N/A np->bname, &ret_devname, &ret_driver, ep);
2N/A
2N/A Free(devidstr);
2N/A
2N/A /*
2N/A * If the other side is not running device id in disksets,
2N/A * 'ret' is set to ENOTSUP in which case we fallback to
2N/A * the existing behaviour
2N/A */
2N/A if (ret == ENOTSUP)
2N/A return (1);
2N/A else if (ret == -1)
2N/A return (-1);
2N/A
2N/A /*
2N/A * ret_devname comes from the rpc call and is a
2N/A * raw device name. We need to make this into a
2N/A * block device via blkname for further processing.
2N/A * Unfortunately, when our device id isn't found in
2N/A * the system, the rpc call will return a " " in
2N/A * ret_devname in which case we need to fill that in
2N/A * as ret_blkname because blkname of " " returns NULL.
2N/A */
2N/A if (ret_bname != NULL && ret_devname != NULL) {
2N/A ret_blkdevname = blkname(ret_devname);
2N/A if (ret_blkdevname == NULL)
2N/A *ret_bname = Strdup(ret_devname);
2N/A else
2N/A *ret_bname = Strdup(ret_blkdevname);
2N/A }
2N/A
2N/A if (ret_dname != NULL && ret_driver != NULL)
2N/A *ret_dname = Strdup(ret_driver);
2N/A
2N/A if (ret_mnum != NULL)
2N/A *ret_mnum = meta_getminor(retdev);
2N/A
2N/A return (1);
2N/A}
2N/A
2N/Aint
2N/Ameta_is_drive_in_anyset(
2N/A mddrivename_t *dnp,
2N/A mdsetname_t **spp,
2N/A int bypass_daemon,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A set_t setno;
2N/A mdsetname_t *this_sp;
2N/A int is_it;
2N/A set_t max_sets;
2N/A
2N/A if ((max_sets = get_max_sets(ep)) == 0)
2N/A return (-1);
2N/A
2N/A assert(spp != NULL);
2N/A *spp = NULL;
2N/A
2N/A for (setno = 1; setno < max_sets; setno++) {
2N/A if (!bypass_daemon) {
2N/A if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
2N/A if (mdismddberror(ep, MDE_DB_NODB)) {
2N/A mdclrerror(ep);
2N/A return (0);
2N/A }
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A return (-1);
2N/A }
2N/A } else
2N/A this_sp = metafakesetname(setno, NULL);
2N/A
2N/A if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
2N/A bypass_daemon, ep)) == -1) {
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A return (-1);
2N/A }
2N/A if (is_it) {
2N/A *spp = this_sp;
2N/A return (0);
2N/A }
2N/A }
2N/A return (0);
2N/A}
2N/A
2N/Aint
2N/Ameta_is_drive_in_thisset(
2N/A mdsetname_t *sp,
2N/A mddrivename_t *dnp,
2N/A int bypass_daemon,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_drive_desc *dd, *p;
2N/A
2N/A if (bypass_daemon)
2N/A dd = dr2drivedesc(sp, MD_SIDEWILD,
2N/A (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
2N/A else
2N/A dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
2N/A
2N/A if (dd == NULL) {
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A return (0);
2N/A }
2N/A
2N/A
2N/A for (p = dd; p != NULL; p = p->dd_next)
2N/A if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
2N/A return (1);
2N/A return (0);
2N/A}
2N/A
2N/A/*
2N/A * Check to see if devid is in use in any diskset.
2N/A * This is used in the case when a partial diskset is being imported
2N/A * to make sure that the unvailable drive isn't already in use in an
2N/A * already imported partial diskset. Can't check on the cname since the
2N/A * unavailable disk's cname is from the previous system and may collide
2N/A * with a cname on this system.
2N/A * Return values:
2N/A * 1: devid has been found in a diskset
2N/A * 0: devid not found in any diskset
2N/A */
2N/Aint
2N/Ameta_is_devid_in_anyset(
2N/A void *devid,
2N/A mdsetname_t **spp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A set_t setno;
2N/A mdsetname_t *this_sp;
2N/A int is_it;
2N/A set_t max_sets;
2N/A
2N/A if ((max_sets = get_max_sets(ep)) == 0)
2N/A return (-1);
2N/A
2N/A assert(spp != NULL);
2N/A *spp = NULL;
2N/A
2N/A for (setno = 1; setno < max_sets; setno++) {
2N/A if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
2N/A if (mdismddberror(ep, MDE_DB_NODB)) {
2N/A mdclrerror(ep);
2N/A return (0);
2N/A }
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A return (-1);
2N/A }
2N/A
2N/A if ((is_it = meta_is_devid_in_thisset(this_sp,
2N/A devid, ep)) == -1) {
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A return (-1);
2N/A }
2N/A if (is_it) {
2N/A *spp = this_sp;
2N/A return (0);
2N/A }
2N/A }
2N/A return (0);
2N/A}
2N/A
2N/Aint
2N/Ameta_is_devid_in_thisset(
2N/A mdsetname_t *sp,
2N/A void *devid,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_drive_desc *dd, *p;
2N/A ddi_devid_t dd_devid;
2N/A
2N/A dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
2N/A if (dd == NULL) {
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A return (0);
2N/A }
2N/A
2N/A for (p = dd; p != NULL; p = p->dd_next) {
2N/A if (p->dd_dnp->devid == NULL)
2N/A continue;
2N/A (void) devid_str_decode(p->dd_dnp->devid,
2N/A &dd_devid, NULL);
2N/A if (dd_devid == NULL)
2N/A continue;
2N/A if (devid_compare(devid, dd_devid) == 0) {
2N/A devid_free(dd_devid);
2N/A return (1);
2N/A }
2N/A devid_free(dd_devid);
2N/A }
2N/A return (0);
2N/A}
2N/A
2N/Aint
2N/Ameta_set_balance(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd, *curdd;
2N/A daddr_t dbsize;
2N/A daddr_t nblks;
2N/A int i;
2N/A int rval = 0;
2N/A sigset_t oldsigs;
2N/A md_setkey_t *cl_sk;
2N/A md_error_t xep = mdnullerror;
2N/A md_mnnode_desc *nd;
2N/A int suspend1_flag = 0;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
2N/A
2N/A /* Make sure we own the set */
2N/A if (meta_check_ownership(sp, ep) != 0)
2N/A return (-1);
2N/A
2N/A /* END CHECK CODE */
2N/A
2N/A /*
2N/A * Get drive descriptors for the drives that are currently in the set.
2N/A */
2N/A curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
2N/A
2N/A if (! mdisok(ep))
2N/A return (-1);
2N/A
2N/A /* Find the minimum replica size in use is or use the default */
2N/A if ((nblks = meta_db_minreplica(sp, ep)) < 0)
2N/A mdclrerror(ep);
2N/A else
2N/A dbsize = nblks; /* adjust replica size */
2N/A
2N/A /* Make sure we are blocking all signals */
2N/A if (procsigs(TRUE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A /*
2N/A * Lock the set on current set members.
2N/A * For MN diskset lock_set and SUSPEND are used to protect against
2N/A * other meta* commands running on the other nodes.
2N/A */
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A /*
2N/A * Lock out other meta* commands by suspending
2N/A * class 1 messages across the diskset.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename,
2N/A COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2N/A MD_MSCF_NO_FLAGS, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A suspend1_flag = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A } else {
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0') continue;
2N/A
2N/A if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A }
2N/A
2N/A /* We are not adding or deleting any drives, just balancing */
2N/A dd = NULL;
2N/A
2N/A /*
2N/A * Balance the DB's according to the list of existing drives and the
2N/A * list of added drives.
2N/A */
2N/A if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
2N/A goto out;
2N/A
2N/Aout:
2N/A /*
2N/A * Unlock diskset by resuming class 1 messages across the diskset.
2N/A * Just resume all classes so that resume is the same whether
2N/A * just one class was locked or all classes were locked.
2N/A */
2N/A if (suspend1_flag) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2N/A sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2N/A /*
2N/A * We are here because we failed to resume
2N/A * rpc.mdcommd. However we potentially have
2N/A * an error from the previous call
2N/A * (meta_db_balance). If the previous call
2N/A * did fail, we capture that error and
2N/A * generate a perror withthe string,
2N/A * "Unable to resume...".
2N/A * Setting rval to -1 ensures that in the
2N/A * next iteration of the loop, ep is not
2N/A * clobbered.
2N/A */
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to resume rpc.mdcommd."));
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/A /* Unlock the set */
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A } else {
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = -1;
2N/A }
2N/A }
2N/A }
2N/A
2N/A /* release signals back to what they were on entry */
2N/A if (procsigs(FALSE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A cl_set_setkey(NULL);
2N/A
2N/A metaflushsetname(sp);
2N/A
2N/A return (rval);
2N/A}
2N/A
2N/Aint
2N/Ameta_set_destroy(
2N/A mdsetname_t *sp,
2N/A int lock_set,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A int i;
2N/A med_rec_t medr;
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd, *p, *p1;
2N/A mddrivename_t *dnp;
2N/A mdname_t *np;
2N/A mdnamelist_t *nlp = NULL;
2N/A int num_users = 0;
2N/A int has_set;
2N/A side_t mysideno;
2N/A sigset_t oldsigs;
2N/A md_error_t xep = mdnullerror;
2N/A md_setkey_t *cl_sk;
2N/A int rval = 0;
2N/A int delete_end = 1;
2N/A
2N/A /* Make sure we are blocking all signals */
2N/A if (procsigs(TRUE, &oldsigs, ep) < 0)
2N/A return (-1);
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A if (! mdisok(ep))
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * meta_set_destroy should not be called for a MN diskset.
2N/A * This routine destroys a set without communicating this information
2N/A * to the other nodes which would lead to an inconsistency in
2N/A * the MN diskset.
2N/A */
2N/A if (MD_MNSET_DESC(sd)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Continue if a traditional diskset */
2N/A
2N/A /*
2N/A * Check to see who has the set. If we are not the last user of the
2N/A * set, we will not touch the replicas.
2N/A */
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
2N/A ep);
2N/A
2N/A if (has_set < 0) {
2N/A mdclrerror(ep);
2N/A } else
2N/A num_users++;
2N/A }
2N/A
2N/A if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
2N/A if (! mdisok(ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A if (lock_set == TRUE) {
2N/A /* Lock the set on our side */
2N/A if (clnt_lock_set(mynode(), sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * A traditional diskset has no diskset stale information to send
2N/A * since there can only be one owner node at a time.
2N/A */
2N/A if (snarf_set(sp, FALSE, ep))
2N/A mdclrerror(ep);
2N/A
2N/A if (dd != NULL) {
2N/A /*
2N/A * Make sure that no drives are in use as parts of metadrives
2N/A * or hot spare pools, this is one of the few error conditions
2N/A * that will stop this routine, unless the environment has
2N/A * META_DESTROY_SET_OK set, in which case, the operation will
2N/A * proceed.
2N/A */
2N/A if (getenv("META_DESTROY_SET_OK") == NULL) {
2N/A for (p = dd; p != NULL; p = p->dd_next) {
2N/A dnp = p->dd_dnp;
2N/A
2N/A i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
2N/A if (i == -1) {
2N/A /* need xep - wire calls clear error */
2N/A i = metaget_setownership(sp, &xep);
2N/A if (i == -1) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A mysideno = getmyside(sp, &xep);
2N/A
2N/A if (mysideno == MD_SIDEWILD) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A if (sd->sd_isown[mysideno] == FALSE)
2N/A if (halt_set(sp, &xep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A }
2N/A
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A /* Skip non local nodes */
2N/A if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
2N/A continue;
2N/A
2N/A if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
2N/A mdclrerror(ep);
2N/A }
2N/A
2N/A /*
2N/A * Go thru each drive and individually delete the replicas.
2N/A * This way we can ignore individual errors.
2N/A */
2N/A for (p = dd; p != NULL; p = p->dd_next) {
2N/A uint_t rep_slice;
2N/A
2N/A dnp = p->dd_dnp;
2N/A if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
2N/A (((np = metaslicename(dnp, rep_slice, ep))
2N/A == NULL) &&
2N/A ((np = metaslicename(dnp, MD_SLICE0, ep))
2N/A == NULL))) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A if ((np = metaslicename(dnp,
2N/A rep_slice, ep)) == NULL) {
2N/A if ((np = metaslicename(dnp,
2N/A MD_SLICE0, ep)) == NULL) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A mdclrerror(ep);
2N/A }
2N/A
2N/A /* Yes this is UGLY!!! */
2N/A p1 = p->dd_next;
2N/A p->dd_next = NULL;
2N/A if (rel_own_bydd(sp, p, FALSE, ep))
2N/A mdclrerror(ep);
2N/A p->dd_next = p1;
2N/A
2N/A if (p->dd_dbcnt == 0)
2N/A continue;
2N/A
2N/A /*
2N/A * Skip the replica removal if we are not the last user
2N/A */
2N/A if (num_users != 1)
2N/A continue;
2N/A
2N/A nlp = NULL;
2N/A (void) metanamelist_append(&nlp, np);
2N/A if (meta_db_detach(sp, nlp,
2N/A (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
2N/A mdclrerror(ep);
2N/A metafreenamelist(nlp);
2N/A }
2N/A }
2N/A
2N/A if (halt_set(sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Setup the mediator record */
2N/A (void) memset(&medr, '\0', sizeof (med_rec_t));
2N/A medr.med_rec_mag = MED_REC_MAGIC;
2N/A medr.med_rec_rev = MED_REC_REV;
2N/A medr.med_rec_fl = 0;
2N/A medr.med_rec_sn = sp->setno;
2N/A (void) strcpy(medr.med_rec_snm, sp->setname);
2N/A medr.med_rec_meds = sd->sd_med; /* structure assigment */
2N/A (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
2N/A medr.med_rec_foff = 0;
2N/A
2N/A /*
2N/A * If we are the last remaining user, then remove the mediator hosts
2N/A */
2N/A if (num_users == 1) {
2N/A for (i = 0; i < MED_MAX_HOSTS; i++) {
2N/A if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
2N/A SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
2N/A SVM_TAG_MEDIATOR, sp->setno, i);
2N/A (void) memset(&medr.med_rec_meds.n_lst[i], '\0',
2N/A sizeof (md_h_t));
2N/A }
2N/A medr.med_rec_meds.n_cnt = 0;
2N/A } else { /* Remove this host from the mediator node list. */
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A /* Copy non local node */
2N/A if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
2N/A (void) strcpy(medr.med_rec_nodes[i],
2N/A sd->sd_nodes[i]);
2N/A continue;
2N/A }
2N/A
2N/A /* Clear local node */
2N/A (void) memset(&medr.med_rec_nodes[i], '\0',
2N/A sizeof (md_node_nm_t));
2N/A }
2N/A }
2N/A
2N/A crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
2N/A
2N/A /*
2N/A * If the client is part of a cluster put the DCS service
2N/A * into a deleteing state.
2N/A */
2N/A if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
2N/A if (metad_isautotakebyname(sp->setname)) {
2N/A delete_end = 0;
2N/A } else {
2N/A mdclrerror(ep);
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /* Inform the mediator hosts of the new information */
2N/A for (i = 0; i < MED_MAX_HOSTS; i++) {
2N/A if (sd->sd_med.n_lst[i].a_cnt == 0)
2N/A continue;
2N/A
2N/A if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
2N/A mdclrerror(ep);
2N/A }
2N/A
2N/A /* Delete the set locally */
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A /* Skip non local nodes */
2N/A if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
2N/A continue;
2N/A
2N/A if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
2N/A mdclrerror(ep);
2N/A }
2N/A if (delete_end &&
2N/A sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
2N/A rval = -1;
2N/A
2N/Aout:
2N/A /* release signals back to what they were on entry */
2N/A if (procsigs(FALSE, &oldsigs, &xep) < 0) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = -1;
2N/A }
2N/A
2N/A if (lock_set == TRUE) {
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = -1;
2N/A }
2N/A cl_set_setkey(NULL);
2N/A }
2N/A
2N/A metaflushsetname(sp);
2N/A return (rval);
2N/A}
2N/A
2N/Aint
2N/Ameta_set_purge(
2N/A mdsetname_t *sp,
2N/A int bypass_cluster,
2N/A int forceflg,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A char *thishost = mynode();
2N/A md_set_desc *sd;
2N/A md_setkey_t *cl_sk;
2N/A md_error_t xep = mdnullerror;
2N/A int rval = 0;
2N/A int i, num_hosts = 0;
2N/A int has_set = 0;
2N/A int max_node = 0;
2N/A int delete_end = 1;
2N/A md_mnnode_desc *nd;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A /* unable to find set description */
2N/A rval = 1;
2N/A return (rval);
2N/A }
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A /*
2N/A * Get a count of the hosts in the set and also lock the set
2N/A * on those hosts that know about it.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /*
2N/A * Only deal with those nodes that are members of
2N/A * the set (MD_MN_NODE_ALIVE) or the node on which
2N/A * the purge is being run. We must lock the set
2N/A * on the purging node because the delset call
2N/A * requires the lock to be set.
2N/A */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
2N/A nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A has_set = nodehasset(sp, nd->nd_nodename,
2N/A NHS_NST_EQ, ep);
2N/A
2N/A /*
2N/A * The host is not aware of this set (has_set < 0) or
2N/A * the set does not match (has_set == 0). This check
2N/A * prevents the code getting confused by an apparent
2N/A * inconsistancy in the set's state, this is in the
2N/A * purge code so something is broken in any case and
2N/A * this is just trying to fix the brokeness.
2N/A */
2N/A if (has_set <= 0) {
2N/A mdclrerror(ep);
2N/A nd->nd_flags |= MD_MN_NODE_NOSET;
2N/A } else {
2N/A num_hosts++;
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A /*
2N/A * If the force flag is set then
2N/A * ignore any RPC failures because we
2N/A * are only really interested with
2N/A * the set on local node.
2N/A */
2N/A if (forceflg && mdanyrpcerror(ep)) {
2N/A mdclrerror(ep);
2N/A } else {
2N/A /*
2N/A * set max_node so that in the
2N/A * unlock code nodes in the
2N/A * set that have not been
2N/A * locked are not unlocked.
2N/A */
2N/A max_node = nd->nd_nodeid;
2N/A rval = 2;
2N/A goto out1;
2N/A }
2N/A }
2N/A
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A max_node = 0;
2N/A } else {
2N/A /*
2N/A * Get a count of the hosts in the set and also lock the set
2N/A * on those hosts that know about it.
2N/A */
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A has_set = nodehasset(sp, sd->sd_nodes[i],
2N/A NHS_NST_EQ, ep);
2N/A
2N/A /*
2N/A * The host is not aware of this set (has_set < 0) or
2N/A * the set does not match (has_set == 0). This check
2N/A * prevents the code getting confused by an apparent
2N/A * inconsistancy in the set's state, this is in the
2N/A * purge code so something is broken in any case and
2N/A * this is just trying to fix the brokeness.
2N/A */
2N/A if (has_set <= 0) {
2N/A mdclrerror(ep);
2N/A /*
2N/A * set the node to NULL to prevent further
2N/A * requests to this unresponsive node.
2N/A */
2N/A sd->sd_nodes[i][0] = '\0';
2N/A } else {
2N/A num_hosts++;
2N/A if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
2N/A /*
2N/A * If the force flag is set then
2N/A * ignore any RPC failures because we
2N/A * are only really interested with
2N/A * the set on local node.
2N/A */
2N/A if (forceflg && mdanyrpcerror(ep)) {
2N/A mdclrerror(ep);
2N/A } else {
2N/A rval = 2;
2N/A /*
2N/A * set max_node so that in the
2N/A * unlock code nodes in the
2N/A * set that have not been
2N/A * locked are not unlocked.
2N/A */
2N/A max_node = i;
2N/A goto out1;
2N/A }
2N/A }
2N/A }
2N/A }
2N/A max_node = i; /* now MD_MAXSIDES */
2N/A }
2N/A if (!bypass_cluster) {
2N/A /*
2N/A * If there is only one host associated with the
2N/A * set then remove the set from the cluster.
2N/A */
2N/A if (num_hosts == 1) {
2N/A if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
2N/A if (metad_isautotakebyname(sp->setname)) {
2N/A delete_end = 0;
2N/A } else {
2N/A mdclrerror(ep);
2N/A rval = 3;
2N/A goto out1;
2N/A }
2N/A }
2N/A }
2N/A }
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2N/A /*
2N/A * This is the node on which the purge is
2N/A * being run. We do not care if it is
2N/A * alive or not, just want to get rid of
2N/A * the set.
2N/A */
2N/A if (clnt_delset(nd->nd_nodename, sp,
2N/A ep) == -1) {
2N/A md_perror(dgettext(TEXT_DOMAIN,
2N/A "delset"));
2N/A if (!bypass_cluster && num_hosts == 1)
2N/A (void) sdssc_delete_end(
2N/A sp->setname, SDSSC_CLEANUP);
2N/A mdclrerror(ep);
2N/A goto out1;
2N/A }
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * Only contact those nodes that are members of
2N/A * the set.
2N/A */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * Tell the remote node to remove this node
2N/A */
2N/A if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
2N/A ep) == -1) {
2N/A /*
2N/A * If we fail to delete ourselves
2N/A * from the remote host it does not
2N/A * really matter because the set is
2N/A * being "purged" from this node. The
2N/A * set can be purged from the other
2N/A * node at a later time.
2N/A */
2N/A mdclrerror(ep);
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A } else {
2N/A for (i = 0; i < MD_MAXSIDES; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
2N/A /*
2N/A * Tell the remote node to remove this node
2N/A */
2N/A if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
2N/A &thishost, ep) == -1) {
2N/A /*
2N/A * If we fail to delete ourselves
2N/A * from the remote host it does not
2N/A * really matter because the set is
2N/A * being "purged" from this node. The
2N/A * set can be purged from the other
2N/A * node at a later time.
2N/A */
2N/A mdclrerror(ep);
2N/A }
2N/A continue;
2N/A }
2N/A
2N/A /* remove the set from this host */
2N/A if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
2N/A md_perror(dgettext(TEXT_DOMAIN, "delset"));
2N/A if (!bypass_cluster && num_hosts == 1)
2N/A (void) sdssc_delete_end(sp->setname,
2N/A SDSSC_CLEANUP);
2N/A mdclrerror(ep);
2N/A goto out1;
2N/A }
2N/A }
2N/A }
2N/A
2N/A if (!bypass_cluster && num_hosts == 1) {
2N/A if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
2N/A SDSSC_ERROR) {
2N/A rval = 4;
2N/A }
2N/A }
2N/A
2N/Aout1:
2N/A
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A
2N/A /*
2N/A * Remove the set lock on those nodes that had the set locked
2N/A * max_node will either be MD_MAXSIDES or array index of the last
2N/A * node contacted (or rather failed to contact) for traditional
2N/A * diskset. For a MN diskset, max_node is the node_id of the node
2N/A * that failed the lock.
2N/A */
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (nd->nd_nodeid == max_node)
2N/A break;
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2N/A if (forceflg && mdanyrpcerror(&xep)) {
2N/A mdclrerror(&xep);
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = 5;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A } else {
2N/A for (i = 0; i < max_node; i++) {
2N/A /* Skip empty slots */
2N/A if (sd->sd_nodes[i][0] == '\0')
2N/A continue;
2N/A
2N/A if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
2N/A if (forceflg && mdanyrpcerror(&xep)) {
2N/A mdclrerror(&xep);
2N/A continue;
2N/A }
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = 5;
2N/A }
2N/A }
2N/A }
2N/A
2N/A cl_set_setkey(NULL);
2N/A
2N/A return (rval);
2N/A}
2N/A
2N/Aint
2N/Ameta_set_query(
2N/A mdsetname_t *sp,
2N/A mddb_dtag_lst_t **dtlpp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A mddb_dtag_get_parm_t dtgp;
2N/A
2N/A (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
2N/A dtgp.dtgp_setno = sp->setno;
2N/A
2N/A /*CONSTCOND*/
2N/A while (1) {
2N/A if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
2N/A if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
2N/A *dtlpp == NULL)
2N/A return (mdstealerror(ep, &dtgp.dtgp_mde));
2N/A else
2N/A break;
2N/A
2N/A /*
2N/A * Run to the end of the list
2N/A */
2N/A for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
2N/A /* void */;
2N/A
2N/A *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
2N/A
2N/A (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
2N/A sizeof (mddb_dtag_t));
2N/A
2N/A dtgp.dtgp_dt.dt_id++;
2N/A }
2N/A return (0);
2N/A}
2N/A
2N/A/*
2N/A * return drivename get by key
2N/A */
2N/Amddrivename_t *
2N/Ametadrivename_withdrkey(
2N/A mdsetname_t *sp,
2N/A side_t sideno,
2N/A mdkey_t key,
2N/A int flags,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A char *nm;
2N/A mdname_t *np;
2N/A mddrivename_t *dnp;
2N/A ddi_devid_t devidp;
2N/A md_set_desc *sd;
2N/A int clboot = 0;
2N/A int ret = -1;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A return (NULL);
2N/A }
2N/A
2N/A if (_cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &clboot) != 0) {
2N/A return (NULL);
2N/A }
2N/A
2N/A /*
2N/A * Get the devid associated with the key.
2N/A *
2N/A * If a devid was returned, it MUST be valid even in
2N/A * the case where a device id has been "updated". The
2N/A * "update" of the device id may have occured due to
2N/A * a firmware upgrade.
2N/A */
2N/A if (MD_MNSET_DESC(sd)) {
2N/A devidp = meta_getdidbykey(MD_LOCAL_SET, sideno, key, ep);
2N/A } else {
2N/A devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep);
2N/A }
2N/A
2N/A if (devidp != NULL) {
2N/A /*
2N/A * Look for the correct dnp using the devid for comparison.
2N/A */
2N/A dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
2N/A free(devidp);
2N/A
2N/A /* dnp could be NULL if the devid could not be decoded. */
2N/A if (dnp == NULL) {
2N/A return (NULL);
2N/A }
2N/A dnp->side_names_key = key;
2N/A } else {
2N/A /*
2N/A * We didn't get a devid. We'll try for a dnp using the
2N/A * name. If we have a MN diskset or if the dnp is a did
2N/A * device, we're done because then we don't have devids.
2N/A * Otherwise we'll try to set the devid
2N/A * and get the dnp via devid again.
2N/A * We also need to clear the ep structure. When the
2N/A * above call to meta_getdidbykey returned a null, it
2N/A * also put an error code into ep. In this case, the null
2N/A * return is actually OK and any errors can be ignored. The
2N/A * reason it is OK is because this could be a MN set or
2N/A * we could be running without devids (ex cluster).
2N/A */
2N/A mdclrerror(ep);
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, ep);
2N/A } else {
2N/A nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, key,
2N/A ep);
2N/A }
2N/A if (nm == NULL)
2N/A return (NULL);
2N/A
2N/A /* get device name */
2N/A if (flags & PRINT_FAST) {
2N/A if ((np = metaname_fast(&sp, nm,
2N/A LOGICAL_DEVICE, ep)) == NULL) {
2N/A Free(nm);
2N/A return (NULL);
2N/A }
2N/A } else {
2N/A if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
2N/A ep)) == NULL) {
2N/A Free(nm);
2N/A return (NULL);
2N/A }
2N/A }
2N/A Free(nm);
2N/A /* make sure it's OK */
2N/A if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
2N/A ep) != 0))
2N/A return (NULL);
2N/A
2N/A /* get drivename */
2N/A dnp = np->drivenamep;
2N/A dnp->side_names_key = key;
2N/A /*
2N/A * Skip the following devid set/check if dnp is a did device
2N/A * and the Sun Cluster did driver does not support devids.
2N/A */
2N/A if (clboot & CLUSTER_CONFIGURED &&
2N/A meta_devid_supported_in_did() == MD_DEVID_NOT_SUPPORTED) {
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * It is okay if replica is not in devid mode
2N/A */
2N/A if (mdissyserror(ep, MDDB_F_NODEVID)) {
2N/A mdclrerror(ep);
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * The devid is missing. This means that we have just
2N/A * upgraded from a configuration where devids were not
2N/A * used so try to add in the devid and requery. If the
2N/A * devid still isn't there, that's OK. dnp->devid will
2N/A * be null as it is in any configuration with no devids.
2N/A */
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A ret = meta_setdid(MD_LOCAL_SET, sideno, key, ep);
2N/A } else {
2N/A ret = meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep);
2N/A }
2N/A if (ret < 0)
2N/A return (NULL);
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A devidp = meta_getdidbykey(MD_LOCAL_SET, sideno, key,
2N/A ep);
2N/A } else {
2N/A devidp = meta_getdidbykey(MD_LOCAL_SET, sideno + SKEW,
2N/A key, ep);
2N/A }
2N/A
2N/A if (devidp != NULL) {
2N/A /*
2N/A * Found a devid so look for the dnp using the
2N/A * devid as the search mechanism.
2N/A */
2N/A dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
2N/A free(devidp);
2N/A if (dnp == NULL) {
2N/A return (NULL);
2N/A }
2N/A dnp->side_names_key = key;
2N/A }
2N/A }
2N/A
2N/A
2N/A
2N/Aout:
2N/A if (flags & MD_BYPASS_DAEMON)
2N/A return (dnp);
2N/A
2N/A if (get_sidenmlist(sp, dnp, ep))
2N/A return (NULL);
2N/A
2N/A /* return success */
2N/A return (dnp);
2N/A}
2N/A
2N/Avoid
2N/Ametafreedrivedesc(md_drive_desc **dd)
2N/A{
2N/A md_drive_desc *p, *next = NULL;
2N/A
2N/A for (p = *dd; p != NULL; p = next) {
2N/A next = p->dd_next;
2N/A Free(p);
2N/A }
2N/A *dd = NULL;
2N/A}
2N/A
2N/Amd_drive_desc *
2N/Ametaget_drivedesc(
2N/A mdsetname_t *sp,
2N/A int flags,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A side_t sideno = MD_SIDEWILD;
2N/A
2N/A assert(! (flags & MD_BYPASS_DAEMON));
2N/A
2N/A if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
2N/A return (NULL);
2N/A
2N/A return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
2N/A}
2N/A
2N/Amd_drive_desc *
2N/Ametaget_drivedesc_fromnamelist(
2N/A mdsetname_t *sp,
2N/A mdnamelist_t *nlp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A mdnamelist_t *p;
2N/A md_drive_desc *dd = NULL;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (NULL);
2N/A
2N/A for (p = nlp; p != NULL; p = p->next)
2N/A (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
2N/A sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
2N/A
2N/A return (dd);
2N/A}
2N/A
2N/Amd_drive_desc *
2N/Ametaget_drivedesc_sideno(
2N/A mdsetname_t *sp,
2N/A side_t sideno,
2N/A int flags,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd = NULL;
2N/A
2N/A assert(! (flags & MD_BYPASS_DAEMON));
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (NULL);
2N/A
2N/A if (sd->sd_drvs)
2N/A return (sd->sd_drvs);
2N/A
2N/A if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
2N/A return (NULL);
2N/A
2N/A return (sd->sd_drvs);
2N/A}
2N/A
2N/Aint
2N/Ametaget_setownership(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A int bool;
2N/A int i;
2N/A md_mnnode_desc *nd;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL)
2N/A return (-1);
2N/A
2N/A if (MD_MNSET_DESC(sd)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* If node isn't alive, can't own diskset */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A /*
2N/A * If can't communicate with rpc.metad, then mark
2N/A * this node as not an owner. That node may
2N/A * in fact, be an owner, but without rpc.metad running
2N/A * that node can't do much.
2N/A */
2N/A if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A } else if (bool == TRUE) {
2N/A nd->nd_flags |= MD_MN_NODE_OWN;
2N/A } else {
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A return (0);
2N/A }
2N/A
2N/A /* Rest of code handles traditional disksets */
2N/A
2N/A for (i = 0; i < MD_MAXSIDES; i++)
2N/A sd->sd_isown[i] = 0;
2N/A
2N/A if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
2N/A return (-1);
2N/A
2N/A if (bool == TRUE)
2N/A sd->sd_isown[getmyside(sp, ep)] = 1;
2N/A
2N/A return (0);
2N/A}
2N/A
2N/Achar *
2N/Amynode(void)
2N/A{
2N/A static struct utsname myuname;
2N/A static int done = 0;
2N/A
2N/A if (! done) {
2N/A if (uname(&myuname) == -1) {
2N/A md_perror(dgettext(TEXT_DOMAIN, "uname"));
2N/A assert(0);
2N/A }
2N/A done = 1;
2N/A }
2N/A return (myuname.nodename);
2N/A}
2N/A
/*
 * strinlst
 *	Tell whether str is equal to one of the first cnt strings of lst.
 *
 * Returns TRUE if a match is found, FALSE otherwise (including when
 * cnt is zero or negative).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**entry = lst;
	int	remaining;

	for (remaining = cnt; remaining > 0; remaining--, entry++) {
		if (strcmp(*entry, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
2N/A
2N/A/*
2N/A * meta_get_reserved_names
2N/A * returns an mdnamelist_t of reserved slices
2N/A * reserved slices are those that are used but don't necessarily
2N/A * show up as metadevices (ex. reserved slice for db in sets, logs)
2N/A */
2N/A
2N/A/*ARGSUSED*/
2N/Aint
2N/Ameta_get_reserved_names(
2N/A mdsetname_t *sp,
2N/A mdnamelist_t **nlpp,
2N/A int options,
2N/A md_error_t *ep)
2N/A{
2N/A int count = 0;
2N/A mdname_t *np = NULL;
2N/A mdnamelist_t *transnlp = NULL;
2N/A mdnamelist_t **tailpp = nlpp;
2N/A mdnamelist_t *nlp;
2N/A md_drive_desc *dd, *di;
2N/A
2N/A if (metaislocalset(sp))
2N/A goto out;
2N/A
2N/A if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2N/A count = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* db in for sets on reserved slice */
2N/A for (di = dd; di && count >= 0; di = di->dd_next) {
2N/A uint_t rep_slice;
2N/A
2N/A /*
2N/A * Add the name struct to the end of the
2N/A * namelist but keep a pointer to the last
2N/A * element so that we don't incur the overhead
2N/A * of traversing the list each time
2N/A */
2N/A if (di->dd_dnp &&
2N/A (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2N/A (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2N/A (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2N/A count++;
2N/A else
2N/A count = -1;
2N/A }
2N/A
2N/A /* now find logs */
2N/A if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2N/A count = -1;
2N/A goto out;
2N/A }
2N/A
2N/A for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2N/A mdname_t *transnp = nlp->namep;
2N/A md_trans_t *transp;
2N/A
2N/A if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2N/A count = -1;
2N/A goto out;
2N/A }
2N/A if (transp->lognamep) {
2N/A /*
2N/A * Add the name struct to the end of the
2N/A * namelist but keep a pointer to the last
2N/A * element so that we don't incur the overhead
2N/A * of traversing the list each time
2N/A */
2N/A tailpp = meta_namelist_append_wrapper(
2N/A tailpp, transp->lognamep);
2N/A }
2N/A }
2N/Aout:
2N/A metafreenamelist(transnlp);
2N/A return (count);
2N/A}
2N/A
2N/A/*
2N/A * Entry point to join a node to MultiNode diskset.
2N/A *
2N/A * Validate host in diskset.
2N/A * - Should be in membership list from API
2N/A * - Should not already be joined into diskset.
2N/A * - Set must have drives
2N/A * Assume valid configuration is stored in the set/drive/node records
2N/A * in the local mddb since no node or drive can be added to the MNset
2N/A * unless all drives and nodes are available. Reconfig steps will
2N/A * resync all ALIVE nodes in case of panic in critical areas.
2N/A *
2N/A * Lock down the set.
2N/A * Verify host is a member of this diskset.
2N/A * If drives exist in the configuration, load the mddbs.
2N/A * Set this node to active by notifying master if one exists.
2N/A * If this is the first node active in the diskset, this node
2N/A * becomes the master.
2N/A * Unlock the set.
2N/A *
2N/A * Mirror Resync:
2N/A * If this node is the last node to join the set and clustering
2N/A * isn't running, then start the 'metasync -r' type resync
2N/A * on all mirrors in this diskset.
2N/A * If clustering is running, this resync operation will
2N/A * be handled by the reconfig steps and should NOT
2N/A * be handled during a join operation.
2N/A *
2N/A * There are multiple return values in order to assist
2N/A * the join operation of all sets in the metaset command.
2N/A *
2N/A * Return values:
2N/A * 0 - Node successfully joined to set.
2N/A * -1 - Join attempted but failed
2N/A * - any failure from libmeta calls
2N/A * - node not in the member list
2N/A * -2 - Join not attempted since
2N/A * - this set had no drives in set
2N/A * - this node already joined to set
2N/A * - set is not a multinode set
2N/A * -3 - Node joined to STALE set.
2N/A */
2N/Aextern int
2N/Ameta_set_join(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd;
2N/A md_mnnode_desc *nd, *nd2, my_nd;
2N/A int rval = 0;
2N/A md_setkey_t *cl_sk;
2N/A md_error_t xep = mdnullerror;
2N/A md_error_t ep_snarf = mdnullerror;
2N/A int master_flag = 0;
2N/A md_mnset_record *mas_mnsr = NULL;
2N/A int clear_nr_flags = 0;
2N/A md_mnnode_record *nr;
2N/A int stale_set = 0;
2N/A int rb_flags = 0;
2N/A int stale_bool = FALSE;
2N/A int suspendall_flag = 0;
2N/A int suspend1_flag = 0;
2N/A sigset_t oldsigs;
2N/A int send_reinit = 0;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A return (-1);
2N/A }
2N/A
2N/A /* Must be a multinode diskset */
2N/A if (!MD_MNSET_DESC(sd)) {
2N/A (void) mderror(ep, MDE_NOT_MN, sp->setname);
2N/A return (-2);
2N/A }
2N/A
2N/A /* Verify that the node is ALIVE (i.e. is in the API membership list) */
2N/A if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2N/A sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Make sure we are blocking all signals */
2N/A if (procsigs(TRUE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A /*
2N/A * Lock the set on current set members.
2N/A * For MN diskset lock_set and SUSPEND are used to protect against
2N/A * other meta* commands running on the other nodes.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * Lock out other meta* commands by suspending
2N/A * class 1 messages across the diskset.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2N/A sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A suspend1_flag = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * Verify that this host is a member (in the host list) of the set.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (strcmp(mynode(), nd->nd_nodename) == 0) {
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A if (!nd) {
2N/A (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2N/A sd->sd_mn_mynode->nd_nodename, NULL,
2N/A sp->setname);
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * Need to return failure if host is already 'joined'
2N/A * into the set. This is done so that if later the user
2N/A * issues a command to join all sets and a failure is
2N/A * encountered - that the resulting cleanup effort
2N/A * (withdrawing from all sets that were joined
2N/A * during that command) won't withdraw from this set.
2N/A */
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A rval = -2;
2N/A goto out2;
2N/A }
2N/A
2N/A /*
2N/A * Call metaget_setownership that calls each node in diskset and
2N/A * marks in set descriptor if node is an owner of the set or not.
2N/A * metaget_setownership checks to see if a node is an owner by
2N/A * checking to see if that node's kernel has the mddb loaded.
2N/A * If a node had panic'd during a reconfig or an
2N/A * add/delete/join/withdraw operation, the other nodes' node
2N/A * records may not reflect the current state of the diskset,
2N/A * so calling metaget_setownership is the safest thing to do.
2N/A */
2N/A if (metaget_setownership(sp, ep) == -1) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* If first active member of diskset, become the master. */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (nd->nd_flags & MD_MN_NODE_OWN)
2N/A break;
2N/A nd = nd->nd_next;
2N/A }
2N/A if (nd == NULL)
2N/A master_flag = 1;
2N/A
2N/A /*
2N/A * If not first active member of diskset, then get the
2N/A * master information from a node that is already joined
2N/A * and set the master information for this node. Be sure
2N/A * that this node (the already joined node) has its own
2N/A * join flag set. If not, then this diskset isn't currently
2N/A * consistent and shouldn't allow a node to join. This diskset
2N/A * inconsistency should only occur when a node has panic'd in
2N/A * the set while doing a metaset operation and the sysadmin is
2N/A * attempting to join a node into the set. This inconsistency
2N/A * will be fixed during a reconfig cycle which should be occurring
2N/A * soon since a node panic'd.
2N/A *
2N/A * If unable to get this information from an owning node, then
2N/A * this diskset isn't currently consistent and shouldn't
2N/A * allow a node to join.
2N/A */
2N/A if (!master_flag) {
2N/A /* get master information from an owner (joined) node */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A sp->setno, &mas_mnsr, ep) == -1) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Verify that owner (joined) node has its own JOIN flag set */
2N/A nr = mas_mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if ((nd->nd_nodeid == nr->nr_nodeid) &&
2N/A ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2N/A (void) mddserror(ep, MDE_DS_NODENOSET,
2N/A sp->setno, nd->nd_nodename, NULL,
2N/A nd->nd_nodename);
2N/A free_sr((md_set_record *)mas_mnsr);
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A
2N/A /*
2N/A * Does master have set marked as STALE?
2N/A * If so, need to pass this down to kernel when
2N/A * this node snarfs the set.
2N/A */
2N/A if (clnt_mn_is_stale(nd->nd_nodename, sp,
2N/A &stale_bool, ep) == -1) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* set master information in my rpc.metad's set record */
2N/A if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2N/A mas_mnsr->sr_master_nodeid, ep)) {
2N/A free_sr((md_set_record *)mas_mnsr);
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* set master information in my cached set desc */
2N/A (void) strcpy(sd->sd_mn_master_nodenm,
2N/A mas_mnsr->sr_master_nodenm);
2N/A sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2N/A nd2 = sd->sd_nodelist;
2N/A while (nd2) {
2N/A if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2N/A sd->sd_mn_masternode = nd2;
2N/A break;
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A free_sr((md_set_record *)mas_mnsr);
2N/A
2N/A /*
2N/A * Set the node flags in mynode's rpc.metad node records for
2N/A * the nodes that are in the diskset. Can use my sd
2N/A * since earlier call to metaget_setownership set the
2N/A * owner flags based on whether that node had snarfed
2N/A * the MN diskset mddb. Reconfig steps guarantee that
2N/A * return of metaget_setownership will match the owning
2N/A * node's owner list except in the case where a node
2N/A * has just panic'd and in this case, a reconfig will
2N/A * be starting immediately and the owner lists will
2N/A * be sync'd up by the reconfig.
2N/A *
2N/A * Flag of SET means to take no action except to
2N/A * set the node flags as given in the nodelist linked list.
2N/A */
2N/A if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2N/A MD_NR_SET, NULL, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Read in the mddb if there are drives in the set.
2N/A */
2N/A if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep)) == NULL) {
2N/A /* No drives in list */
2N/A if (! mdisok(ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A rval = -2;
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * Notify rpc.mdcommd on all nodes of a nodelist change.
2N/A * Start by suspending rpc.mdcommd (which drains it of all messages),
2N/A * then change the nodelist followed by a reinit and resume.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2N/A MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A suspendall_flag = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /* Set master in my set record in rpc.metad */
2N/A if (master_flag) {
2N/A if (clnt_mnsetmaster(mynode(), sp,
2N/A sd->sd_mn_mynode->nd_nodename,
2N/A sd->sd_mn_mynode->nd_nodeid, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A /*
2N/A * Causes mddbs to be loaded into the kernel.
2N/A * Set the force flag so that replica locations can be
2N/A * loaded into the kernel even if a mediator node was
2N/A * unavailable. This allows a node to join an MO
2N/A * diskset when there are sufficient replicas available,
2N/A * but a mediator node in unavailable.
2N/A */
2N/A if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Host not able to start diskset."));
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A if (! mdisok(ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * Set rollback flags to 1 so that halt_set is called if a failure
2N/A * is seen after this point. If snarf_set fails, still need to
2N/A * call halt_set to cleanup the diskset.
2N/A */
2N/A rb_flags = 1;
2N/A
2N/A /* Starts the set */
2N/A if (snarf_set(sp, stale_bool, ep) != 0) {
2N/A if (mdismddberror(ep, MDE_DB_STALE)) {
2N/A /*
2N/A * Don't fail join, STALE means that set has
2N/A * < 50% mddbs.
2N/A */
2N/A (void) mdstealerror(&ep_snarf, ep);
2N/A stale_set = 1;
2N/A } else if (mdisok(ep)) {
2N/A /* If snarf failed, but no error was set - set it */
2N/A (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2N/A sp->setno, 0, NULL);
2N/A rval = -1;
2N/A goto out;
2N/A } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2N/A /*
2N/A * Don't fail join if ACCOK; ACCOK means that mediator
2N/A * provided extra vote.
2N/A */
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /* Did set really get snarfed? */
2N/A if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2N/A if (mdisok(ep)) {
2N/A /* If snarf failed, but no error was set - set it */
2N/A (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
2N/A sp->setno, 0, NULL);
2N/A }
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Host not able to start diskset."));
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Change to nodelist so need to send reinit to rpc.mdcommd */
2N/A send_reinit = 1;
2N/A
2N/A /* If first node to enter set, setup master and clear change log */
2N/A if (master_flag) {
2N/A /* Set master in my locally cached set descriptor */
2N/A (void) strcpy(sd->sd_mn_master_nodenm,
2N/A sd->sd_mn_mynode->nd_nodename);
2N/A sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2N/A sd->sd_mn_am_i_master = 1;
2N/A
2N/A /*
2N/A * If first node to join set, then clear out change log
2N/A * entries. Change log entries are only needed when a
2N/A * change of master is occurring in a diskset that has
2N/A * multiple owners. Since this node is the first owner
2N/A * of the diskset, clear the entries.
2N/A *
2N/A * Only do this if we are in a single node non-SC3.x
2N/A * situation.
2N/A */
2N/A if (meta_mn_singlenode() &&
2N/A mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to reset changelog."));
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /* Set my locally cached flag */
2N/A sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2N/A
2N/A /*
2N/A * Set this node's own flag on all joined nodes in the set
2N/A * (including my node).
2N/A */
2N/A clear_nr_flags = 1;
2N/A
2N/A my_nd = *(sd->sd_mn_mynode);
2N/A my_nd.nd_next = NULL;
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2N/A MD_NR_JOIN, NULL, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/Aout:
2N/A if (rval != NULL) {
2N/A /*
2N/A * If rollback flag is 1, then node was joined to set.
2N/A * Since an error occurred, withdraw node from set in
2N/A * order to rollback to before command was run.
2N/A * Need to preserve ep so that calling function can
2N/A * get error information.
2N/A */
2N/A if (rb_flags == 1) {
2N/A if (halt_set(sp, &xep)) {
2N/A mdclrerror(&xep);
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * If error, reset master to INVALID.
2N/A * Ignore error since (next) first node to successfully join
2N/A * will set master on all nodes.
2N/A */
2N/A (void) clnt_mnsetmaster(mynode(), sp, "",
2N/A MD_MN_INVALID_NID, &xep);
2N/A mdclrerror(&xep);
2N/A /* Reset master in my locally cached set descriptor */
2N/A sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2N/A sd->sd_mn_am_i_master = 0;
2N/A
2N/A /*
2N/A * If nr flags set on other nodes, reset them.
2N/A */
2N/A if (clear_nr_flags) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A (void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2N/A &my_nd, MD_NR_WITHDRAW, NULL, &xep);
2N/A mdclrerror(&xep);
2N/A nd = nd->nd_next;
2N/A }
2N/A /* Reset my locally cached flag */
2N/A sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Notify rpc.mdcommd on all nodes of a nodelist change.
2N/A * Send reinit command to mdcommd which forces it to get
2N/A * fresh set description.
2N/A */
2N/A if (send_reinit) {
2N/A /* Send reinit */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Class is ignored for REINIT */
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2N/A sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2N/A /*
2N/A * We are here because we failed to resume
2N/A * rpc.mdcommd. However we potentially have
2N/A * an error from the previous call
2N/A * If the previous call did fail, we capture
2N/A * that error and generate a perror with
2N/A * the string, "Unable to resume...".
2N/A * Setting rval to -1 ensures that in the
2N/A * next iteration of the loop, ep is not
2N/A * clobbered.
2N/A */
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to reinit rpc.mdcommd."));
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A }
2N/A
2N/Aout2:
2N/A /*
2N/A * Unlock diskset by resuming messages across the diskset.
2N/A * Just resume all classes so that resume is the same whether
2N/A * just one class was locked or all classes were locked.
2N/A */
2N/A if ((suspend1_flag) || (suspendall_flag)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2N/A sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2N/A /*
2N/A * We are here because we failed to resume
2N/A * rpc.mdcommd. However we potentially have
2N/A * an error from the previous call
2N/A * If the previous call did fail, we capture
2N/A * that error and generate a perror with
2N/A * the string, "Unable to resume...".
2N/A * Setting rval to -1 ensures that in the
2N/A * next iteration of the loop, ep is not
2N/A * clobbered.
2N/A */
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to resume rpc.mdcommd."));
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A meta_ping_mnset(sp->setno);
2N/A }
2N/A
2N/A /*
2N/A * Unlock set. This flushes the caches on the servers.
2N/A */
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * If this node is the last to join the diskset and clustering isn't
2N/A * running, then resync the mirrors in the diskset. We have to wait
2N/A * until all nodes are joined so that the status gets propagated to
2N/A * all of the members of the set.
2N/A * Ignore any error from the resync as the join function shouldn't fail
2N/A * because the mirror resync had a problem.
2N/A *
2N/A * Don't start resync if set is stale.
2N/A */
2N/A if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2N/A (stale_set != 1)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN))
2N/A break;
2N/A nd = nd->nd_next;
2N/A }
2N/A /*
2N/A * nd set to NULL means that we have no nodes in the set that
2N/A * haven't joined. In this case we start the resync.
2N/A */
2N/A if (nd == NULL) {
2N/A (void) meta_mirror_resync_all(sp, 0, &xep);
2N/A mdclrerror(&xep);
2N/A }
2N/A }
2N/A
2N/A /* Update ABR state for all soft partitions */
2N/A (void) meta_sp_update_abr(sp, &xep);
2N/A mdclrerror(&xep);
2N/A
2N/A /*
2N/A * call metaflushsetnames to reset local cache for master and
2N/A * node information.
2N/A */
2N/A metaflushsetname(sp);
2N/A
2N/A /* release signals back to what they were on entry */
2N/A if (procsigs(FALSE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A /*
2N/A * If no error and stale_set is set, then set ep back
2N/A * to ep from snarf_set call and return -3. If another error
2N/A * occurred and rval is not 0, then that error would have
2N/A * caused the node to be withdrawn from the set and would
2N/A * have set ep to that error information.
2N/A */
2N/A if ((rval == 0) && (stale_set)) {
2N/A (void) mdstealerror(ep, &ep_snarf);
2N/A return (-3);
2N/A }
2N/A
2N/A return (rval);
2N/A}
2N/A
2N/A/*
2N/A * Entry point to withdraw a node from MultiNode diskset.
2N/A *
2N/A * Validate host in diskset.
2N/A * - Should be joined into diskset.
2N/A * Assume valid configuration is stored in the set/drive/node records
2N/A * in the local mddb since no node or drive can be added to the MNset
2N/A * unless all drives and nodes are available. Reconfig steps will
2N/A * resync all ALIVE nodes in case of panic in critical areas.
2N/A *
2N/A * Lock down the set.
2N/A * Verify that drives exist in configuration.
2N/A * Verify host is a member of this diskset.
2N/A * Verify host is an owner of the diskset (host is joined to diskset).
2N/A * Only allow withdrawal of master node if master node is the only joined
2N/A * node in the diskset.
2N/A * Halt the diskset on this node.
2N/A * Reset Master on this node.
2N/A * Update node flags to show that this node has withdrawn.
2N/A * Unlock the set.
2N/A *
2N/A * Return values:
2N/A * 0 - Node successfully withdrew from set.
2N/A * -1 - Withdrawal attempted but failed
2N/A * - any failure from libmeta calls
2N/A * - node not in the member list
2N/A * - set is not a multinode set
2N/A * -2 - Withdrawal not attempted since
2N/A * - this set had no drives in set
2N/A * - this node not joined to set
2N/A */
2N/Aextern int
2N/Ameta_set_withdraw(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A md_drive_desc *dd = 0;
2N/A md_mnnode_desc *nd, my_nd;
2N/A int rval = 0;
2N/A md_setkey_t *cl_sk;
2N/A md_error_t xep = mdnullerror;
2N/A int set_halted = 0;
2N/A int suspendall_flag = 0;
2N/A int suspend1_flag = 0;
2N/A bool_t stale_bool = FALSE;
2N/A mddb_config_t c;
2N/A int node_id_list[1];
2N/A sigset_t oldsigs;
2N/A int send_reinit = 0;
2N/A
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A return (-1);
2N/A }
2N/A
2N/A /* Must be a multinode diskset */
2N/A if (!MD_MNSET_DESC(sd)) {
2N/A (void) mderror(ep, MDE_NOT_MN, sp->setname);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Make sure we are blocking all signals */
2N/A if (procsigs(TRUE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A /*
2N/A * Lock the set on current set members.
2N/A * For MN diskset lock_set and SUSPEND are used to protect against
2N/A * other meta* commands running on the other nodes.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A /*
2N/A * Lock out other meta* commands by suspending
2N/A * class 1 messages across the diskset.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2N/A sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A suspend1_flag = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /* Get list of drives - needed in case of failure */
2N/A if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep)) == NULL) {
2N/A /* Error getting drives in list */
2N/A if (! mdisok(ep)) {
2N/A rval = -1;
2N/A goto out2;
2N/A }
2N/A /* no drives in list */
2N/A rval = -2;
2N/A goto out2;
2N/A }
2N/A
2N/A /*
2N/A * Verify that this host is a member (in the host list) of the set.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (strcmp(mynode(), nd->nd_nodename) == 0) {
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A if (!nd) {
2N/A (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2N/A sd->sd_mn_mynode->nd_nodename, NULL,
2N/A sp->setname);
2N/A rval = -1;
2N/A goto out2;
2N/A }
2N/A
2N/A /*
2N/A * Call metaget_setownership that calls each node in diskset and
2N/A * marks in set descriptor if node is an owner of the set or not.
2N/A * metaget_setownership checks to see if a node is an owner by
2N/A * checking to see if that node's kernel has the mddb loaded.
2N/A * If a node had panic'd during a reconfig or an
2N/A * add/delete/join/withdraw operation, the other nodes' node
2N/A * records may not reflect the current state of the diskset,
2N/A * so calling metaget_setownership is the safest thing to do.
2N/A */
2N/A if (metaget_setownership(sp, ep) == -1) {
2N/A rval = -1;
2N/A goto out2;
2N/A }
2N/A
2N/A /*
2N/A * Verify that this node is joined
2N/A * to diskset (i.e. is an owner of the diskset).
2N/A */
2N/A if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2N/A rval = -2;
2N/A goto out2;
2N/A }
2N/A
2N/A /*
2N/A * For a MN diskset, only withdraw master if it is
2N/A * the only joined node.
2N/A */
2N/A if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip my node since checking for other owners */
2N/A if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A /* If another owner node if found, error */
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A (void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
2N/A sp->setno,
2N/A sd->sd_mn_mynode->nd_nodename, NULL,
2N/A sp->setname);
2N/A rval = -1;
2N/A goto out2;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Is current set STALE?
2N/A */
2N/A (void) memset(&c, 0, sizeof (c));
2N/A c.c_id = 0;
2N/A c.c_setno = sp->setno;
2N/A if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2N/A (void) mdstealerror(ep, &c.c_mde);
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A if (c.c_flags & MDDB_C_STALE) {
2N/A stale_bool = TRUE;
2N/A }
2N/A
2N/A /*
2N/A * Notify rpc.mdcommd on all nodes of a nodelist change.
2N/A * Start by suspending rpc.mdcommd (which drains it of all messages),
2N/A * then change the nodelist followed by a reinit and resume.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2N/A sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A suspendall_flag = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * Withdraw the set - halt set.
2N/A * This will fail if any I/O is occuring to any metadevice which
2N/A * includes a resync to a mirror metadevice.
2N/A */
2N/A set_halted = 1;
2N/A if (halt_set(sp, ep)) {
2N/A /* Was set actually halted? */
2N/A if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
2N/A set_halted = 0;
2N/A }
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Change to nodelist so need to send reinit to rpc.mdcommd */
2N/A send_reinit = 1;
2N/A
2N/A /* Reset master on withdrawn node */
2N/A if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
2N/A MD_MN_INVALID_NID, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Mark my node as withdrawn and send to other nodes */
2N/A nd = sd->sd_nodelist;
2N/A my_nd = *(sd->sd_mn_mynode); /* structure copy */
2N/A my_nd.nd_next = NULL;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2N/A MD_NR_WITHDRAW, NULL, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * If withdrawn node is a mirror owner, reset mirror owner
2N/A * to NULL. If an error occurs, print a warning and continue.
2N/A * Don't fail metaset because of mirror owner reset problem since
2N/A * next node to grab mirror will resolve this issue.
2N/A * Before next node grabs mirrors, metaset will show the withdrawn
2N/A * node as owner which is why an attempt to reset the mirror owner
2N/A * is made.
2N/A */
2N/A node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
2N/A 1, &node_id_list[0], &xep) == 01) {
2N/A mde_perror(&xep, dgettext(TEXT_DOMAIN,
2N/A "Unable to reset mirror owner on node %s"),
2N/A nd->nd_nodename);
2N/A mdclrerror(&xep);
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/Aout:
2N/A if (rval == -1) {
2N/A /* Rejoin node - Mark node as joined and send to other nodes */
2N/A nd = sd->sd_nodelist;
2N/A my_nd = *(sd->sd_mn_mynode); /* structure copy */
2N/A my_nd.nd_next = NULL;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2N/A MD_NR_JOIN, NULL, &xep)) {
2N/A mdclrerror(&xep);
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /* Set master on withdrawn node */
2N/A if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
2N/A sd->sd_mn_master_nodenm,
2N/A sd->sd_mn_master_nodeid, &xep)) {
2N/A mdclrerror(&xep);
2N/A }
2N/A
2N/A /* Join set if halt_set had succeeded */
2N/A if (set_halted) {
2N/A /*
2N/A * Causes mddbs to be loaded into the kernel.
2N/A * Set the force flag so that replica locations can be
2N/A * loaded into the kernel even if a mediator node was
2N/A * unavailable. This allows a node to join an MO
2N/A * diskset when there are sufficient replicas available,
2N/A * but a mediator node in unavailable.
2N/A */
2N/A if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
2N/A mdclrerror(&xep);
2N/A }
2N/A /* If set previously stale - make it so at re-join */
2N/A if (snarf_set(sp, stale_bool, &xep) != 0) {
2N/A mdclrerror(&xep);
2N/A (void) halt_set(sp, &xep);
2N/A mdclrerror(&xep);
2N/A }
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Notify rpc.mdcommd on all nodes of a nodelist change.
2N/A * Send reinit command to mdcommd which forces it to get
2N/A * fresh set description.
2N/A */
2N/A if (send_reinit) {
2N/A /* Send reinit */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Class is ignored for REINIT */
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2N/A sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2N/A /*
2N/A * We are here because we failed to resume
2N/A * rpc.mdcommd. However we potentially have
2N/A * an error from the previous call.
2N/A * If the previous call did fail, we
2N/A * capture that error and generate a perror
2N/A * withthe string, "Unable to resume...".
2N/A * Setting rval to -1 ensures that in the
2N/A * next iteration of the loop, ep is not
2N/A * clobbered.
2N/A */
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to reinit rpc.mdcommd."));
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/Aout2:
2N/A /*
2N/A * Unlock diskset by resuming messages across the diskset.
2N/A * Just resume all classes so that resume is the same whether
2N/A * just one class was locked or all classes were locked.
2N/A */
2N/A if ((suspend1_flag) || (suspendall_flag)) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2N/A sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2N/A /*
2N/A * We are here because we failed to resume
2N/A * rpc.mdcommd. However we potentially have
2N/A * an error from the previous call
2N/A * If the previous call did fail, we capture
2N/A * that error and generate a perror with
2N/A * the string, "Unable to resume...".
2N/A * Setting rval to -1 ensures that in the
2N/A * next iteration of the loop, ep is not
2N/A * clobbered.
2N/A */
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to resume rpc.mdcommd."));
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A meta_ping_mnset(sp->setno);
2N/A }
2N/A
2N/A /*
2N/A * Unlock set. This flushes the caches on the servers.
2N/A */
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2N/A if (rval == 0)
2N/A (void) mdstealerror(ep, &xep);
2N/A else
2N/A mdclrerror(&xep);
2N/A rval = -1;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * call metaflushsetnames to reset local cache for master and
2N/A * node information.
2N/A */
2N/A metaflushsetname(sp);
2N/A
2N/A /* release signals back to what they were on entry */
2N/A if (procsigs(FALSE, &oldsigs, &xep) < 0)
2N/A mdclrerror(&xep);
2N/A
2N/A return (rval);
2N/A
2N/A}
2N/A
2N/A/*
2N/A * Update nodelist with cluster member information.
2N/A * A node not in the member list will be marked
2N/A * as not ALIVE and not OWN.
2N/A * A node in the member list will be marked ALIVE, but
2N/A * the OWN bit will not be changed.
2N/A *
2N/A * If mynode isn't in the membership list, fail causing
2N/A * another reconfig cycle to be started since a non-member
2N/A * node shouldn't be taking part in the reconfig cycle.
2N/A *
2N/A * Return values:
2N/A * 0 - No problem.
2N/A * 1 - Any failure including RPC failure to my node.
2N/A */
2N/Aint
2N/Ameta_reconfig_update_nodelist(
2N/A mdsetname_t *sp,
2N/A mndiskset_membershiplist_t *nl,
2N/A md_set_desc *sd,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A mndiskset_membershiplist_t *nl2;
2N/A md_mnnode_desc *nd;
2N/A md_error_t xep = mdnullerror;
2N/A int rval = 0;
2N/A
2N/A /*
2N/A * Walk through nodelist, checking to see if each
2N/A * node is in the member list.
2N/A * If node is not a member, reset ALIVE and OWN node flag.
2N/A * If node is a member, set ALIVE.
2N/A * If mynode's OWN flag gets reset, then halt the diskset on this node.
2N/A *
2N/A * The search and the "not a member" handling are kept separate so
2N/A * that an empty member list (nl == NULL) is handled too: in that
2N/A * case no node is a member.
2N/A */
2N/A for (nd = sd->sd_nodelist; nd != NULL; nd = nd->nd_next) {
2N/A /* Look for this node in the member list */
2N/A for (nl2 = nl; nl2 != NULL; nl2 = nl2->next) {
2N/A if (nl2->msl_node_id == nd->nd_nodeid)
2N/A break;
2N/A }
2N/A
2N/A /* If node is in member list, set ALIVE */
2N/A if (nl2 != NULL) {
2N/A nd->nd_flags |= MD_MN_NODE_ALIVE;
2N/A continue;
2N/A }
2N/A
2N/A /* node is not in member list, mark !ALIVE and !OWN */
2N/A
2N/A /* If node is mynode, then halt set if needed */
2N/A if (strcmp(mynode(), nd->nd_nodename) == 0) {
2N/A /*
2N/A * This shouldn't happen, but just
2N/A * in case... Any node not in the
2N/A * membership list should be dead and
2N/A * not running reconfig step1.
2N/A */
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A if (halt_set(sp, &xep)) {
2N/A mde_perror(&xep, "");
2N/A mdclrerror(&xep);
2N/A }
2N/A }
2N/A /*
2N/A * Return failure since this node
2N/A * (mynode) is not in the membership
2N/A * list, but process the rest of the
2N/A * nodelist first so that rpc.metad
2N/A * can be updated with the latest
2N/A * membership information.
2N/A */
2N/A (void) mddserror(ep,
2N/A MDE_DS_NOTINMEMBERLIST,
2N/A sp->setno, nd->nd_nodename, NULL,
2N/A sp->setname);
2N/A rval = 1;
2N/A }
2N/A nd->nd_flags &= ~MD_MN_NODE_ALIVE;
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A }
2N/A
2N/A /* Send this information to rpc.metad */
2N/A if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2N/A MD_NR_SET, MNSET_IN_RECONFIG, &xep)) {
2N/A /* Return failure if can't send node flags to rpc.metad */
2N/A if (rval == 0) {
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = 1;
2N/A } else {
2N/A /*
2N/A * ep already holds the membership error; discard
2N/A * this one rather than leaving it set in xep.
2N/A */
2N/A mdclrerror(&xep);
2N/A }
2N/A }
2N/A return (rval);
2N/A}
2N/A
2N/A/*
2N/A * Choose master determines the master for a diskset.
2N/A * Each node determines the master on its own and
2N/A * adds this information to its local rpc.metad nodelist
2N/A * and also sends it to the kernel.
2N/A *
2N/A * Nodelist in set descriptor (sd) is sorted in
2N/A * monotonically increasing sequence of nodeid.
2N/A *
2N/A * Return values:
2N/A * 0 - No problem.
2N/A * 205 - There was an RPC problem to another node.
2N/A * -1 - There was an error. This could be an RPC error to my node.
2N/A * This is a catastrophic failure causing node to panic.
2N/A */
2N/Aint
2N/Ameta_reconfig_choose_master_for_set(
2N/A mdsetname_t *sp,
2N/A md_set_desc *sd,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A int is_owner;
2N/A md_mnset_record *mnsr = NULL;
2N/A int lowest_alive_nodeid = 0;
2N/A uint_t master_nodeid;
2N/A md_mnnode_desc *nd, *nd2;
2N/A md_mnnode_record *nr;
2N/A md_drive_desc *dd;
2N/A md_setkey_t *cl_sk;
2N/A int rval = 0;
2N/A md_error_t xep = mdnullerror;
2N/A mddb_setflags_config_t sf;
2N/A
2N/A /*
2N/A * Is current node joined to diskset?
2N/A * Don't trust flags, really check to see if mddb is snarfed.
2N/A */
2N/A if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
2N/A /*
2N/A * If a node is joined to the diskset, this node checks
2N/A * to see if the current master of the diskset is valid and
2N/A * is still in the membership list (ALIVE) and is
2N/A * still joined (OWN). Need to verify if master is
2N/A * really joined - don't trust the flags. (Can trust
2N/A * ALIVE since set during earlier part of reconfig cycle.)
2N/A * If the current master is valid, still in the membership
2N/A * list and joined, then master is not changed on this node.
2N/A * Just return.
2N/A *
2N/A * Verify that nodeid is valid before accessing masternode.
2N/A */
2N/A if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
2N/A (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
2N/A &is_owner, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A sd->sd_mn_master_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A } else {
2N/A if (is_owner == TRUE) {
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(
2N/A TEXT_DOMAIN, "Set %s previous "
2N/A "master chosen %s (%d): %s"),
2N/A sp->setname,
2N/A sd->sd_mn_master_nodenm,
2N/A sd->sd_mn_master_nodeid,
2N/A meta_print_hrtime(gethrtime() -
2N/A start_time));
2N/A
2N/A /* Previous master is ok - done */
2N/A return (0);
2N/A }
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * If current master is no longer in the membership list or
2N/A * is no longer joined, then this node uses the following
2N/A * algorithm:
2N/A * - node calls RPC routine clnt_ownset to get latest
2N/A * information on which nodes are owners of diskset.
2N/A * clnt_ownset checks on each node to see if its kernel
2N/A * has that diskset snarfed.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A if (clnt_ownset(nd->nd_nodename, sp,
2N/A &is_owner, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Set owner flag for each node based on whether
2N/A * that node really has a diskset mddb snarfed in
2N/A * or not.
2N/A */
2N/A if (is_owner == TRUE)
2N/A nd->nd_flags |= MD_MN_NODE_OWN;
2N/A else
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - node walks through nodelist looking for nodes that are
2N/A * owners of the diskset that are in the membership list.
2N/A * - for each owner, node calls RPC routine clnt_getset to
2N/A * see if that node has its node record set to OK.
2N/A * - If so, master is chosen to be this owner node.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Don't consider a node that isn't an owner */
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Does node has its own node record set to OK? */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (nd->nd_nodeid == nr->nr_nodeid) {
2N/A if (nr->nr_flags & MD_MN_NODE_OK) {
2N/A /* Found a master */
2N/A free_sr(
2N/A (md_set_record *)mnsr);
2N/A goto found_master;
2N/A }
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A free_sr((md_set_record *)mnsr);
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - If no owner node has its own node record on its own node
2N/A * set to OK, then this node checks all of the non-owner
2N/A * nodes that are in the membership list.
2N/A * - for each non-owner, node calls RPC routine clnt_getset to
2N/A * see if that node has its node record set to OK.
2N/A * - If set doesn't exist, don't choose node for master.
2N/A * - If so, master is chosen to be this non-owner node.
2N/A *
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Only checking non-owner nodes this time around */
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Does node has its own node record set to OK? */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A /*
2N/A * If set doesn't exist on non-owner node,
2N/A * don't consider this node for master.
2N/A */
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A } else if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A /* RPC failure to another node */
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (nd->nd_nodeid == nr->nr_nodeid) {
2N/A if (nr->nr_flags & MD_MN_NODE_OK) {
2N/A /* Found a master */
2N/A free_sr(
2N/A (md_set_record *)mnsr);
2N/A goto found_master;
2N/A }
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A free_sr((md_set_record *)mnsr);
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - If no node can be found that has its own node record on
2N/A * its node to be set to OK, then all alive nodes
2N/A * were in the process of being added to or deleted
2N/A * from set. Each alive node will remove all
2N/A * information pertaining to this set from its node.
2N/A *
2N/A * If all nodes in set are ALIVE, then call sdssc end routines
2N/A * since set was truly being initially created or destroyed.
2N/A */
2N/A goto delete_set;
2N/A } else {
2N/A
2N/A /*
2N/A * If node is not joined to diskset, then this
2N/A * node uses the following algorithm:
2N/A * - If unjoined node doesn't have a node record for itself,
2N/A * just delete the diskset since diskset was in the
2N/A * process of being created.
2N/A * - node needs to find master of diskset before
2N/A * reconfig cycle, if a master existed.
2N/A * - node calls RPC routine clnt_ownset to get latest
2N/A * information on which nodes are owners of diskset.
2N/A * clnt_ownset checks on each node to see if its
2N/A * kernel has that diskset snarfed.
2N/A */
2N/A
2N/A /*
2N/A * Is my node in the set description?
2N/A * If not, delete the set from this node.
2N/A * sr2setdesc sets sd_mn_mynode pointer to the node
2N/A * descriptor for this node if there was a node
2N/A * record for this node.
2N/A *
2N/A */
2N/A if (sd->sd_mn_mynode == NULL) {
2N/A goto delete_set;
2N/A }
2N/A
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A if (clnt_ownset(nd->nd_nodename, sp,
2N/A &is_owner, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Set owner flag for each node based on whether
2N/A * that node really has a diskset mddb snarfed in
2N/A * or not.
2N/A */
2N/A if (is_owner == TRUE)
2N/A nd->nd_flags |= MD_MN_NODE_OWN;
2N/A else
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - node walks through nodelist looking for nodes that
2N/A * are owners of the diskset that are in
2N/A * the membership list.
2N/A * - for each owner, node calls RPC routine clnt_getset to
2N/A * see if that node has a master set and to get the
2N/A * diskset description.
2N/A * - If the owner node has a set description that doesn't
2N/A * include the non-joined node in the nodelist, this node
2N/A * removes its set description of that diskset
2N/A * (i.e. removes the set from its local mddbs). This is
2N/A * handling the case of when a node was removed from a
2N/A * diskset while it was not in the cluster membership
2N/A * list.
2N/A * - If that node has a master set and the master is in the
2N/A * membership list and is an owner, then either this was
2N/A * the master from before the reconfig cycle or this
2N/A * node has already chosen a new master - either way,
2N/A * the master value is valid as long as it is in the
2N/A * membership list and is an owner
2N/A * - master is chosen to be owner node's master
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Don't consider a node that isn't an owner */
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Get owner node's set record */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A
2N/A /* Is this node in the owner node's set record */
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (sd->sd_mn_mynode->nd_nodeid ==
2N/A nr->nr_nodeid) {
2N/A break;
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A if (nr == NULL) {
2N/A /* my node not found - delete set */
2N/A free_sr((md_set_record *)mnsr);
2N/A goto delete_set;
2N/A }
2N/A
2N/A /* Is owner's node's master valid? */
2N/A master_nodeid = mnsr->sr_master_nodeid;
2N/A free_sr((md_set_record *)mnsr);
2N/A if (master_nodeid == MD_MN_INVALID_NID) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A nd2 = sd->sd_nodelist;
2N/A while (nd2) {
2N/A if ((nd2->nd_nodeid == master_nodeid) &&
2N/A (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
2N/A (nd2->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd2;
2N/A goto found_master;
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - If no owner node has a valid master, then follow
2N/A * algorithm of when a node is joined to the diskset.
2N/A * - node walks through nodelist looking for nodes that are
2N/A * owners of the diskset that are in the membership list.
2N/A * - for each owner, node calls RPC routine clnt_getset to
2N/A * see if that node has its node record set to OK.
2N/A * - If so, master is chosen to be this owner node.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Don't consider a node that isn't an owner */
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Does node has its own node record set to OK? */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (nd->nd_nodeid == nr->nr_nodeid) {
2N/A if (nr->nr_flags & MD_MN_NODE_OK) {
2N/A /* Found a master */
2N/A free_sr(
2N/A (md_set_record *)mnsr);
2N/A goto found_master;
2N/A }
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A free_sr((md_set_record *)mnsr);
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * - If no owner node has its own node record on its own node
2N/A * set to OK, then this node checks all of the non-owner
2N/A * nodes that are in the membership list.
2N/A * - for each non-owner, node calls RPC routine clnt_getset to
2N/A * see if that node has its node record set to OK.
2N/A * - If set doesn't exist, don't choose node for master.
2N/A * - If this node doesn't exist in the nodelist on any of the
2N/A * non-owner nodes, this node removes its set description
2N/A * of that diskset (i.e. removes the set from its local
2N/A * mddbs). This is handling the case of when a node was
2N/A * removed from a diskset while it was not in the
2N/A * cluster membership list.
2N/A * - If non-owner node has its node record set to OK and if
2N/A * this node hasn't removed this diskset (step directly
2N/A * before this one), then the master is chosen to be this
2N/A * non-owner node.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Don't consider node that isn't in member list */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* Don't consider owner nodes since none are OK */
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * Don't need to get nodelist from my node since
2N/A * this is where sd_nodelist was obtained.
2N/A */
2N/A if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * If node has already been decided against for
2N/A * master, then skip it.
2N/A */
2N/A if (nd->nd_flags & MD_MN_NODE_DEL) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * Does node in my nodelist have its own node
2N/A * record marked OK on its node? And does node
2N/A * in my nodelist exist on all other nodes?
2N/A * Don't want to choose a node for master unless
2N/A * that node is marked OK on its own node and that
2N/A * node exists on all other alive nodes.
2N/A *
2N/A * This is guarding against the case when several
2N/A * nodes are down and one of the downed nodes is
2N/A * deleted from the diskset. When the down nodes
2N/A * are rebooted into the cluster, you don't want
2N/A * any node to pick the deleted node as the master.
2N/A */
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A /*
2N/A * If set doesn't exist on non-owner node,
2N/A * don't consider this node for master.
2N/A */
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A nd = nd->nd_next;
2N/A continue;
2N/A } else if (mdanyrpcerror(ep)) {
2N/A /* RPC failure to another node */
2N/A return (205);
2N/A } else {
2N/A /* Any other failure */
2N/A return (-1);
2N/A }
2N/A }
2N/A /*
2N/A * Is my node in the nodelist gotten from the other
2N/A * node? If not, then remove the set from my node
2N/A * since set was deleted from my node while my node
2N/A * was out of the cluster.
2N/A */
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (sd->sd_mn_mynode->nd_nodeid ==
2N/A nr->nr_nodeid) {
2N/A break;
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A if (nr == NULL) {
2N/A /* my node not found - delete set */
2N/A free_sr((md_set_record *)mnsr);
2N/A goto delete_set;
2N/A }
2N/A
2N/A /* Is node being checked marked OK on its own node? */
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (nd->nd_nodeid == nr->nr_nodeid) {
2N/A if (!(nr->nr_flags & MD_MN_NODE_OK)) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A }
2N/A break;
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A /*
2N/A * If node being checked doesn't exist on its
2N/A * own node - don't choose it as master.
2N/A */
2N/A if (nr == NULL) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A }
2N/A
2N/A /*
2N/A * Check every node in my node's nodelist against
2N/A * the nodelist gotten from the other node.
2N/A * If a node in my node's nodelist is not found in the
2N/A * other node's nodelist, then set the DEL flag.
2N/A */
2N/A nd2 = sd->sd_nodelist;
2N/A while (nd2) {
2N/A nr = mnsr->sr_nodechain;
2N/A while (nr) {
2N/A if (nd2->nd_nodeid == nr->nr_nodeid) {
2N/A break;
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A /* nd2 not found in other node's nodelist */
2N/A if (nr == NULL) {
2N/A nd2->nd_flags |= MD_MN_NODE_DEL;
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A
2N/A free_sr((md_set_record *)mnsr);
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * Rescan list look for node that has not been marked DEL.
2N/A * First node found is the master.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (nd) {
2N/A /* Found a master */
2N/A goto found_master;
2N/A }
2N/A
2N/A /*
2N/A * - If no node can be found that has its own node record on
2N/A * its node to be set to OK, then all alive nodes
2N/A * were in the process of being added to or deleted
2N/A * from set. Each alive node will remove all
2N/A * information pertaining to this set from its node.
2N/A *
2N/A * If all nodes in set are ALIVE, then call sdssc end routines
2N/A * since set was truly being initially created or destroyed.
2N/A */
2N/A goto delete_set;
2N/A }
2N/A
2N/Afound_master:
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Set %s master chosen %s (%d): %s"),
2N/A sp->setname, nd->nd_nodename, nd->nd_nodeid,
2N/A meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A if (clnt_lock_set(mynode(), sp, ep) == -1) {
2N/A return (-1);
2N/A }
2N/A
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A
2N/A if (clnt_mnsetmaster(mynode(), sp,
2N/A nd->nd_nodename, nd->nd_nodeid, ep)) {
2N/A rval = -1;
2N/A } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
2N/A /* If this node is new master, set flag in this node's kernel */
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A sf.sf_flags = MDDB_NM_SET;
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Setting new master flag for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * Fail reconfig cycle if ioctl fails since it is critical
2N/A * to set new master flag.
2N/A */
2N/A if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
2N/A NULL) != NULL) {
2N/A (void) mdstealerror(ep, &sf.sf_mde);
2N/A rval = -1;
2N/A }
2N/A }
2N/A
2N/A if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
2N/A if (rval == 0) {
2N/A (void) mdstealerror(ep, &xep);
2N/A rval = -1;
2N/A }
2N/A }
2N/A
2N/A cl_set_setkey(NULL);
2N/A
2N/A metaflushsetname(sp);
2N/A
2N/A return (rval);
2N/A
2N/Adelete_set:
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Master not chosen, deleting set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * Remove all set information from this node:
2N/A * - node records for this set
2N/A * - drive records for this set
2N/A * - set record for this set
2N/A * (Only do this on this node since each node
2N/A * will do it for its own local mddb.)
2N/A *
2N/A * If all nodes in set are ALIVE, then
2N/A * the lowest numbered ALIVE nodeid in set
2N/A * (irregardless of whether an owner node or not) will
2N/A * call the DCS service to cleanup for create/delete of set.
2N/A * sdssc_create_end(cleanup) if set was being created or
2N/A * sdssc_delete_end(cleanup) if set was being deleted.
2N/A * A node record with flag ADD denotes a set being
2N/A * created. A node record with flag DEL denotes a
2N/A * set being deleted.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Found a node that isn't alive */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
2N/A break;
2N/A
2N/A /* Is my node the lowest numbered ALIVE node? */
2N/A if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A if (nd == NULL) {
2N/A /* All nodes ALIVE and this is the lowest nodeid */
2N/A lowest_alive_nodeid = 1;
2N/A }
2N/A
2N/A if (clnt_lock_set(mynode(), sp, ep) == -1) {
2N/A return (-1);
2N/A }
2N/A
2N/A
2N/A /*
2N/A * If this node had been joined, withdraw and reset master.
2N/A *
2N/A * This could happen if a node was being added to or removed
2N/A * from a diskset and the node doing the add/delete operation and
2N/A * all other nodes in the diskset have left the cluster.
2N/A */
2N/A if (sd->sd_mn_mynode) {
2N/A nd = sd->sd_mn_mynode;
2N/A if (nd->nd_flags & MD_MN_NODE_OWN) {
2N/A if (clnt_withdrawset(mynode(), sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A if (clnt_mnsetmaster(mynode(), sp, "",
2N/A MD_MN_INVALID_NID, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Remove side records for this node (side) from local mddb
2N/A * (clnt_deldrvs does this) if there are drives in the set.
2N/A *
2N/A * Don't need to mark this node as DEL since already marked as
2N/A * ADD or DEL (or this node would have been chosen as master).
2N/A * Don't need to mark other node records, drive records or
2N/A * set records as DEL. If a panic occurs during clnt_delset,
2N/A * these records will be deleted the next time this node
2N/A * becomes a member and goes through the reconfig cycle.
2N/A */
2N/A /* Get the drive descriptors for this set */
2N/A if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep)) == NULL) {
2N/A if (! mdisok(ep)) {
2N/A /*
2N/A * Ignore and clear out any failures from
2N/A * metaget_drivedesc since a panic could have
2N/A * occurred when a node was partially added to a set.
2N/A */
2N/A mdclrerror(ep);
2N/A }
2N/A } else {
2N/A if (clnt_deldrvs(mynode(), sp, dd, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * Now, delete the set - this removes the node, drive
2N/A * and set records from the local mddb.
2N/A */
2N/A if (clnt_delset(mynode(), sp, ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/Aout:
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A
2N/A /*
2N/A * Ignore errors from unlock of set since set is no longer
2N/A * known (if clnt_delset worked).
2N/A */
2N/A if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
2N/A mdclrerror(&xep);
2N/A }
2N/A
2N/A cl_set_setkey(NULL);
2N/A
2N/A metaflushsetname(sp);
2N/A
2N/A /*
2N/A * If this node is the lowest numbered nodeid then
2N/A * call sdssc_create/delete_end depending on whether
2N/A * this node is marked as ADD or DEL in the node record.
2N/A */
2N/A if (lowest_alive_nodeid) {
2N/A if (nd->nd_flags & MD_MN_NODE_ADD)
2N/A sdssc_create_end(sp->setname, SDSSC_CLEANUP);
2N/A else if (nd->nd_flags & MD_MN_NODE_DEL)
2N/A sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
2N/A }
2N/A
2N/A /* Finished with this set -- return */
2N/A return (rval);
2N/A}
2N/A
2N/A/*
2N/A * Reconfig step to choose a new master for all MN disksets.
2N/A * Return values:
2N/A * 0 - Everything is great.
2N/A * 1 - This node failed to reconfig.
2N/A * 205 - Cause another reconfig due to a nodelist problem
2N/A * or RPC failure to another node
2N/A */
2N/Aint
2N/Ameta_reconfig_choose_master(
2N/A long timeout,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A set_t max_sets, setno;
2N/A int nodecnt;
2N/A mndiskset_membershiplist_t *nl;
2N/A md_set_desc *sd;
2N/A mdsetname_t *sp;
2N/A int rval = 0;
2N/A mddb_setflags_config_t sf;
2N/A int start_node_delayed = 0;
2N/A
2N/A if ((max_sets = get_max_sets(ep)) == 0) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to get number of sets"));
2N/A return (1);
2N/A }
2N/A
2N/A /*
2N/A * Get membershiplist from API routine. If there's
2N/A * an error, return a 205 to cause another reconfig.
2N/A */
2N/A if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2N/A mde_perror(ep, "");
2N/A return (205);
2N/A }
2N/A
2N/A for (setno = 1; setno < max_sets; setno++) {
2N/A if ((sp = metasetnosetname(setno, ep)) == NULL) {
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A /* No set for this setno - continue */
2N/A mdclrerror(ep);
2N/A continue;
2N/A } else {
2N/A /*
2N/A * If encountered an RPC error from my node,
2N/A * then immediately fail.
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A mde_perror(ep, "");
2N/A return (1);
2N/A }
2N/A /* Can't get set information */
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to get information for "
2N/A "set number %d"), setno);
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A }
2N/A
2N/A /* If setname is there, set desc should exist. */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A /*
2N/A * If encountered an RPC error from my node,
2N/A * then immediately fail.
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A mde_perror(ep, "");
2N/A return (1);
2N/A }
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to get set %s desc information"),
2N/A sp->setname);
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A
2N/A /* Only reconfig MN disksets */
2N/A if (!MD_MNSET_DESC(sd)) {
2N/A continue;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Begin choose master for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /* Update nodelist with member information. */
2N/A if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
2N/A /*
2N/A * If encountered an RPC error from my node,
2N/A * then immediately fail.
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A mde_perror(ep, "");
2N/A return (1);
2N/A }
2N/A mde_perror(ep, "");
2N/A mdclrerror(ep);
2N/A continue;
2N/A }
2N/A
2N/A /*
2N/A * If all nodes in a cluster are starting, then
2N/A * all nodes will attempt to contact all other nodes
2N/A * to determine a master node. This can lead to a
2N/A * problem where node 1 is trying to contact the rpc.metad
2N/A * node 2 and node 2 is trying to contact the rpc.metad
2N/A * on node 1 -- and this causes the rpc call to fail
2N/A * on both nodes and causes a new reconfig cycle.
2N/A *
2N/A * In order to break this problem, a newly starting node
2N/A * will delay a small amount of time (nodeid mod 4 seconds)
2N/A * and will then run the code to choose a master for the
2N/A * first set. Delay will only be done once regardless of the
2N/A * number of sets.
2N/A */
2N/A if (start_node_delayed == 0) {
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_flags = MDDB_NM_GET;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
2N/A &sf.sf_mde, NULL) == 0) &&
2N/A ((sf.sf_setflags & MD_SET_MN_START_RC) ==
2N/A MD_SET_MN_START_RC)) {
2N/A (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
2N/A }
2N/A start_node_delayed = 1;
2N/A }
2N/A
2N/A /* Choose master for this set */
2N/A rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
2N/A if (rval == -1) {
2N/A mde_perror(ep, "");
2N/A return (1);
2N/A } else if (rval == 205) {
2N/A mde_perror(ep, "");
2N/A return (205);
2N/A }
2N/A
2N/A /* reinit rpc.mdcommd with new nodelist */
2N/A if (mdmn_reinit_set(sp->setno, timeout)) {
2N/A md_eprintf(dgettext(TEXT_DOMAIN,
2N/A "Could not re-initialise rpc.mdcommd for "
2N/A "set %s\n"), sp->setname);
2N/A return (1);
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Choose master for set %s completed: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A }
2N/A
2N/A /*
2N/A * Each node turns on I/Os for all MN disksets.
2N/A * This is to recover from the situation where the master died
2N/A * during a previous reconfig cycle when I/Os were suspended
2N/A * for a MN diskset.
2N/A * If a failure occurs return a 1 which will force this node to
2N/A * panic. Cannot leave node in the situation where I/Os are
2N/A * not resumed.
2N/A */
2N/A setno = 0; /* 0 means all MN sets */
2N/A if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
2N/A mde_perror(ep, "");
2N/A return (1);
2N/A }
2N/A
2N/A /* Free the nodelist */
2N/A if (nodecnt)
2N/A meta_free_nodelist(nl);
2N/A
2N/A return (0);
2N/A}
2N/A
2N/A/*
2N/A * meta_mnsync_user_records will synchronize the diskset user records across
2N/A * all nodes in the diskset. The diskset user records are stored in
2N/A * each node's local set mddb.
2N/A *
2N/A * This needs to be done even if there is no master change during the
2N/A * reconfig cycle since this routine should clean up any mess left by
2N/A * the untimely termination of a metaset or metadb command (due to a
2N/A * node panic or to user intervention).
2N/A *
2N/A * Caller is the Master node.
2N/A *
2N/A * Returns 0 - Success
2N/A * 205 - Failure during RPC to another node
2N/A * -1 - Any other failure and ep is filled in.
2N/A */
2N/Aint
2N/Ameta_mnsync_user_records(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail;
2N/A md_mnset_record *mnsr;
2N/A md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL;
2N/A md_mnnode_record *nr;
2N/A md_drive_record *dr;
2N/A int dr_cnt, dd_cnt;
2N/A int found_my_nr;
2N/A md_drive_desc *dd, *dd_prev, *master_dd, *other_dd;
2N/A int all_drives_ok;
2N/A int rval = 0;
2N/A int max_genid = 0;
2N/A int num_alive_nodes, num_alive_nodes_del = 0;
2N/A int set_locked = 0;
2N/A md_setkey_t *cl_sk;
2N/A md_error_t xep = mdnullerror;
2N/A char *anode[1];
2N/A mddb_setflags_config_t sf;
2N/A
2N/A /*
2N/A * Sync up node records first.
2N/A * Construct a master nodelist using the nodelist from this
2N/A * node's rpc.metad node records and then setting the state of each
2N/A * node following these rules:
2N/A * - If a node record is marked OK on its node, mark it OK
2N/A * in the master nodelist (and later OK on all nodes)
2N/A * If a node record is also marked OWN on its node,
2N/A * mark it OWN in the master nodelist.
2N/A * - If a node record is not marked OK on its node, then mark
2N/A * it as DEL in the master list (later deleting it)
2N/A * - If node record doesn't exist on that node, then mark it DEL
2N/A * (later deleting it)
2N/A * - If set record doesn't exist on that node, mark node as DEL
2N/A * - If a node record doesn't exist on all nodes, then mark it DEL
2N/A * - If a node is not ALIVE, then
2N/A * - If that node marked DEL on any node - mark it DEL
2N/A * in master list but leave in nodelist
2N/A * - If that node is marked as ADD on any node, mark it
2N/A * ADD in the master list but leave in nodelist
2N/A * - When that node returns to the living, the DEL
2N/A * node record will be removed and the ADD node
2N/A * record may be removed if marked ADD on that
2N/A * node.
2N/A * The key rule is to not remove a node from the nodelist until
2N/A * that node record is removed from its own node. Do not want to
2N/A * remove a node's record from all other nodes and then have
2N/A * that node have its own record marked OK so that a node will pick
2N/A * a different master than the other nodes.
2N/A *
2N/A * Next,
2N/A * If node is ALIVE and node record is marked DEL in master nodelist,
2N/A * remove node from set.
2N/A * If node is ALIVE and node record is marked OK in master nodelist,
2N/A * mark it OK on all other nodes.
2N/A * If node is not ALIVE and node record is marked DEL in master
2N/A * nodelist, mark it DEL on all other nodes.
2N/A * If node is not ALIVE and node record is marked ADD in master,
2N/A * nodelist, mark it ADD on all other nodes.
2N/A */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A return (-1);
2N/A }
2N/A master_nodelist = sd->sd_nodelist;
2N/A
2N/A /*
2N/A * Walk through nodelist creating a master nodelist.
2N/A */
2N/A num_alive_nodes = 0;
2N/A nd = master_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A num_alive_nodes++;
2N/A if (clnt_mngetset(nd->nd_nodename, sp->setname,
2N/A MD_SET_BAD, &mnsr, ep) == -1) {
2N/A if (mdiserror(ep, MDE_NO_SET)) {
2N/A /* set doesn't exist, mark node as DEL */
2N/A nd->nd_flags &= ~MD_MN_NODE_OK;
2N/A nd->nd_flags &= ~MD_MN_NODE_ADD;
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A nd->nd_flags |= MD_MN_NODE_NOSET;
2N/A nd = nd->nd_next;
2N/A continue;
2N/A } else {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A }
2N/A /* Find biggest genid in records for this diskset */
2N/A if (mnsr->sr_genid > max_genid)
2N/A max_genid = mnsr->sr_genid;
2N/A
2N/A dr = mnsr->sr_drivechain;
2N/A while (dr) {
2N/A /* Find biggest genid in records for this diskset */
2N/A if (dr->dr_genid > max_genid) {
2N/A max_genid = dr->dr_genid;
2N/A }
2N/A dr = dr->dr_next;
2N/A }
2N/A
2N/A found_my_nr = 0;
2N/A nr = mnsr->sr_nodechain;
2N/A /* nr is the list of node recs from nd_nodename node */
2N/A while (nr) {
2N/A /* Find biggest genid in records for this diskset */
2N/A if (nr->nr_genid > max_genid)
2N/A max_genid = nr->nr_genid;
2N/A nd2 = master_nodelist;
2N/A ndtail = NULL;
2N/A /* For each node record, is it in master list? */
2N/A while (nd2) {
2N/A if (nd2->nd_nodeid == nr->nr_nodeid)
2N/A break;
2N/A if (nd2->nd_next == NULL)
2N/A ndtail = nd2;
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A /*
2N/A * Found node record not in master list -- add it
2N/A * to list marking it as DEL since node record
2N/A * should exist on all nodes unless a panic occurred
2N/A * during addition or deletion of host to diskset.
2N/A */
2N/A if (nd2 == NULL) {
2N/A nd2 = Zalloc(sizeof (*nd2));
2N/A (void) strcpy(nd2->nd_nodename,
2N/A nr->nr_nodename);
2N/A nd2->nd_flags = nr->nr_flags;
2N/A nd2->nd_flags |= MD_MN_NODE_DEL;
2N/A nd2->nd_nodeid = nr->nr_nodeid;
2N/A nd2->nd_next = NULL;
2N/A ndtail->nd_next = nd2;
2N/A nd2 = NULL;
2N/A nr = nr->nr_next;
2N/A continue;
2N/A }
2N/A /*
2N/A * Is this the node record for the node that
2N/A * we requested the set desc from?
2N/A * If so, check if node has its own node record
2N/A * marked OK. If marked OK, check for the OWN bit.
2N/A */
2N/A if (nr->nr_nodeid == nd->nd_nodeid) {
2N/A found_my_nr = 1;
2N/A if (nr->nr_flags & MD_MN_NODE_OK) {
2N/A /*
2N/A * If node record is marked OK
2N/A * on its own node, then mark it OK
2N/A * in the master list. Node record
2N/A * would have to exist on all nodes
2N/A * in the ADD state before it could
2N/A * be put into the OK state.
2N/A */
2N/A nd->nd_flags |= MD_MN_NODE_OK;
2N/A nd->nd_flags &=
2N/A ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
2N/A /*
2N/A * Mark own in master list as marked
2N/A * on own node.
2N/A */
2N/A if (nr->nr_flags & MD_MN_NODE_OWN)
2N/A nd->nd_flags |= MD_MN_NODE_OWN;
2N/A else
2N/A nd->nd_flags &= ~MD_MN_NODE_OWN;
2N/A } else {
2N/A /* Otherwise, mark node as DEL */
2N/A nd->nd_flags &= ~MD_MN_NODE_OK;
2N/A nd->nd_flags &= ~MD_MN_NODE_ADD;
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A }
2N/A }
2N/A /*
2N/A * If node is not ALIVE and marked DEL
2N/A * on any node, make it DEL in master list.
2N/A * If node is not ALIVE and marked ADD
2N/A * on any node, make it ADD in master list
2N/A * unless node record has already been marked DEL.
2N/A */
2N/A if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
2N/A if (nr->nr_flags & MD_MN_NODE_ADD) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
2N/A /* If not DEL - mark it ADD */
2N/A nd->nd_flags |= MD_MN_NODE_ADD;
2N/A nd->nd_flags &= ~MD_MN_NODE_OK;
2N/A }
2N/A }
2N/A if (nr->nr_flags & MD_MN_NODE_DEL) {
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A nd->nd_flags &= ~MD_MN_NODE_OK;
2N/A /* Could already be ADD - make it DEL */
2N/A nd->nd_flags &= ~MD_MN_NODE_ADD;
2N/A }
2N/A }
2N/A nr = nr->nr_next;
2N/A }
2N/A /*
2N/A * If a node record doesn't exist on its own node,
2N/A * then mark node as DEL.
2N/A */
2N/A if (found_my_nr == 0) {
2N/A nd->nd_flags &= ~MD_MN_NODE_OK;
2N/A nd->nd_flags |= MD_MN_NODE_DEL;
2N/A }
2N/A
2N/A /*
2N/A * If node is OK - put mnsr onto master_mnsr_node list for
2N/A * later use when syncing up the drive records in the set.
2N/A */
2N/A if (nd->nd_flags & MD_MN_NODE_OK) {
2N/A mnsr_node = Zalloc(sizeof (*mnsr_node));
2N/A mnsr_node->mmn_mnsr = mnsr;
2N/A (void) strncpy(mnsr_node->mmn_nodename,
2N/A nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
2N/A mnsr_node->mmn_next = master_mnsr_node;
2N/A master_mnsr_node = mnsr_node;
2N/A } else {
2N/A free_sr((struct md_set_record *)mnsr);
2N/A }
2N/A
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Master nodelist created for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * Send master nodelist to the rpc.metad on all nodes (including
2N/A * myself) and each node will update itself. This will set the
2N/A * ADD and DEL flags on each node as setup in the master nodelist.
2N/A * Don't send nodelist to node where set doesn't exist.
2N/A */
2N/A nd = master_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
2N/A (nd->nd_flags & MD_MN_NODE_NOSET)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2N/A master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * Now, delete nodes that need to be deleted.
2N/A */
2N/A if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep)) == NULL) {
2N/A if (! mdisok(ep)) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * May be doing lots of RPC commands to the nodes, so lock the
2N/A * ALIVE members of the set since most of the rpc.metad routines
2N/A * require this for security reasons.
2N/A */
2N/A nd = master_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive nodes and node without set */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
2N/A (nd->nd_flags & MD_MN_NODE_NOSET)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A set_locked = 1;
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A nd = master_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive nodes */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (nd->nd_flags & MD_MN_NODE_DEL) {
2N/A num_alive_nodes_del++;
2N/A /*
2N/A * Delete this node rec from all ALIVE nodes in diskset.
2N/A */
2N/A nd2 = master_nodelist;
2N/A while (nd2) {
2N/A /* Skip non-alive nodes and node without set */
2N/A if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
2N/A (nd2->nd_flags & MD_MN_NODE_NOSET)) {
2N/A nd2 = nd2->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A /* This is a node being deleted from set */
2N/A if (nd2->nd_nodeid == nd->nd_nodeid) {
2N/A /* Mark set record as DEL */
2N/A if (clnt_upd_sr_flags(nd->nd_nodename,
2N/A sp, MD_SR_DEL, ep)) {
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->
2N/A nd_nodeid
2N/A != nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A if (clnt_deldrvs(nd->nd_nodename, sp,
2N/A dd, ep)) {
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->
2N/A nd_nodeid
2N/A != nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A if (clnt_delset(nd->nd_nodename, sp,
2N/A ep) == -1) {
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->
2N/A nd_nodeid
2N/A != nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A } else {
2N/A /*
2N/A * Delete host from sets on hosts
2N/A * not being deleted.
2N/A */
2N/A anode[0] = Strdup(nd->nd_nodename);
2N/A if (clnt_delhosts(nd2->nd_nodename, sp,
2N/A 1, anode, ep) == -1) {
2N/A Free(anode[0]);
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->
2N/A nd_nodeid
2N/A != nd2->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5,
2N/A dgettext(TEXT_DOMAIN,
2N/A "Deleted node %s (%d) on node %s "
2N/A "from set %s: %s"),
2N/A nd->nd_nodename, nd->nd_nodeid,
2N/A nd2->nd_nodename,
2N/A sp->setname,
2N/A meta_print_hrtime(
2N/A gethrtime() - start_time));
2N/A
2N/A Free(anode[0]);
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A nd = master_nodelist;
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A while (nd) {
2N/A /* Skip non-alive nodes and node without set */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
2N/A (nd->nd_flags & MD_MN_NODE_NOSET)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
2N/A /* If RPC failure to another node return 205 */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A cl_set_setkey(NULL);
2N/A set_locked = 0;
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Nodelist syncronization complete for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A metaflushsetname(sp);
2N/A
2N/A /*
2N/A * If all alive nodes have been deleted from set, just
2N/A * return since nothing else can be done until non-alive
2N/A * nodes (if there are any) rejoin the cluster.
2N/A */
2N/A if (num_alive_nodes == num_alive_nodes_del) {
2N/A rval = 0;
2N/A goto out;
2N/A }
2N/A
2N/A /*
2N/A * Sync up drive records.
2N/A *
2N/A * If a node panic'd (or metaset command was killed) during the
2N/A * addition or deletion of a drive to the diskset, the nodes
2N/A * may have a different view of the drive list. During cleanup
2N/A * of the drive list during reconfig, a drive will be deleted
2N/A * from the list if the master node sees that the drive has been
2N/A * marked in the ADD state on any node or is marked in the DEL state
2N/A * on all nodes.
2N/A * This cleanup must occur even if not all nodes of the diskset are
2N/A * currently part of the cluster so that all nodes have the same view
2N/A * of the drivelist.
2N/A * Then if the entire cluster goes down and comes back up, the
2N/A * new master node could be a node that wasn't in the cluster when
2N/A * the node was deleted. This could lead to a situation where the
2N/A * master node thinks that a drive is OK, but this drive isn't
2N/A * known to the other nodes.
2N/A * This situation can also occur during the addition of a drive
2N/A * where a node has the drive marked OK, but the node executing the
2N/A * metaset command encountered a failure before marking that drive OK
2N/A * on the rest of the nodes. If the node with the OK drive then
2N/A * panics, the rest of the nodes will remove that drive marked ADD
2N/A * and when the node with the OK drive rejoins the cluster, it will
2N/A * have a drive marked OK that is unknown by the other nodes.
2N/A *
2N/A * There are 2 situations to consider:
2N/A * A) Master knows about a drive that other nodes don't know about.
2N/A * B) At least one slave node knows about a drive that the master
2N/A * node doesn't know about.
2N/A *
2N/A * To handle these situations the following steps are followed:
2N/A * 1) Count number of drives known by this master node and the
2N/A * other slave nodes.
2N/A * If all nodes have the same number of drives and the master has
2N/A * all drives marked OK, then skip to step4.
2N/A *
2N/A * 2) If a node has fewer drives listed than the master, the master
2N/A * must get the drive descriptor list from that node so that
2N/A * master can determine which drive it needs to delete from that
2N/A * node. Master must get the drive descriptor list since the
2N/A * drive record list does not contain the name of the drive, but
2N/A * only a key and the key can only be interpreted on that other
2N/A * node.
2N/A *
2N/A * 3) The master will then create the master drive list by doing:
2N/A * - Master starts with drive list known by master.
2N/A * - Any drive marked ADD will be removed from the list.
2N/A * - Any drive not known by another node (from step2) will be
2N/A * removed from the drive list.
2N/A * - If a drive is marked DEL on the master, the master must
2N/A * verify that the drive record is marked DEL on all nodes.
2N/A * If any node has the drive record marked OK, mark it OK
2N/A * on the master. (The reason why is described below).
2N/A *
2N/A * 4) The master sends out the master drive list and the slave
2N/A * nodes will force their drive lists to match the master
2N/A * drive list by deleting drives, if necessary and by changing
2N/A * the drive record states from ADD->OK if master has drive
2N/A * marked OK and slave has drive marked ADD.
2N/A *
2N/A * Interesting scenarios:
2N/A *
2N/A * 1) System has 4 nodes with node 1 as the master. Node 3 starts
2N/A * to delete a drive record (drive record on node 1 is marked DEL),
2N/A * but is stopped when node 3 panics. Node 1 also panics.
2N/A * During reconfig cycle, node 2 is picked as master and the drive
2N/A * record is left alone since all nodes in the cluster have it
2N/A * marked OK. User now sees drive as part of diskset.
2N/A * Now, entire cluster is rebooted and node 1 rejoins the cluster.
2N/A * Node 1 is picked as the master and node 1 has drive record
2N/A * marked DEL. Node 1 contacts all other nodes in the cluster
2N/A * and since at least one node has the drive record marked OK,
2N/A * the master marks the drive record OK.
2N/A * User continues to see the drive as part of the diskset.
2N/A */
2N/A
2N/A /* Reget set descriptor since flushed above */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A
2N/A /* Has side effect of setting sd->sd_drvs to same as master_dd */
2N/A if ((master_dd = metaget_drivedesc_sideno(sp,
2N/A sd->sd_mn_mynode->nd_nodeid,
2N/A (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
2N/A /* No drives in list */
2N/A if (!mdisok(ep)) {
2N/A /*
2N/A * Can't get drive list for this node, so
2N/A * return -1 causing this node to be removed
2N/A * from cluster config and fixed.
2N/A */
2N/A rval = -1;
2N/A goto out;
2N/A }
2N/A }
2N/A
2N/A /* Count the number of drives for all nodes */
2N/A mnsr_node = master_mnsr_node;
2N/A while (mnsr_node) {
2N/A dr_cnt = 0;
2N/A dr = mnsr_node->mmn_mnsr->sr_drivechain;
2N/A while (dr) {
2N/A dr_cnt++;
2N/A dr = dr->dr_next;
2N/A }
2N/A mnsr_node->mmn_numdrives = dr_cnt;
2N/A mnsr_node = mnsr_node->mmn_next;
2N/A }
2N/A
2N/A /* Count the number of drives for the master; also check flags */
2N/A all_drives_ok = 1;
2N/A dd_cnt = 0;
2N/A dd = master_dd;
2N/A while (dd) {
2N/A dd_cnt++;
2N/A if (!(dd->dd_flags & MD_DR_OK))
2N/A all_drives_ok = 0;
2N/A dd = dd->dd_next;
2N/A }
2N/A
2N/A /* If all drives are ok, do quick check against number of drives */
2N/A if (all_drives_ok) {
2N/A /* If all nodes have same number of drives, almost done */
2N/A mnsr_node = master_mnsr_node;
2N/A while (mnsr_node) {
2N/A if (mnsr_node->mmn_numdrives != dd_cnt)
2N/A break;
2N/A mnsr_node = mnsr_node->mmn_next;
2N/A }
2N/A /* All nodes have same number of drives, just send flags */
2N/A if (mnsr_node == NULL) {
2N/A goto send_drive_list;
2N/A }
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Begin detailed drive synchronization for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /* Detailed check required */
2N/A mnsr_node = master_mnsr_node;
2N/A while (mnsr_node) {
2N/A /* Does slave node have less drives than master? */
2N/A if (mnsr_node->mmn_numdrives < dd_cnt) {
2N/A /* Yes - must determine which drive is missing */
2N/A if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
2N/A &other_dd, ep)) {
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (strcmp(mynode(), mnsr_node->mmn_nodename)
2N/A != 0)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Master node %s unable to "
2N/A "retrieve drive list from node %s"),
2N/A mynode(), mnsr_node->mmn_nodename);
2N/A goto out;
2N/A }
2N/A mnsr_node->mmn_dd = other_dd;
2N/A dd = master_dd;
2N/A while (dd) {
2N/A if (!(dd->dd_flags & MD_DR_OK)) {
2N/A dd = dd->dd_next;
2N/A continue;
2N/A }
2N/A other_dd = mnsr_node->mmn_dd;
2N/A while (other_dd) {
2N/A /* Convert to devids, when available */
2N/A if (strcmp(other_dd->dd_dnp->cname,
2N/A dd->dd_dnp->cname) == 0) {
2N/A break;
2N/A }
2N/A other_dd = other_dd->dd_next;
2N/A }
2N/A /*
2N/A * dd not found on slave so mark it
2N/A * ADD for later deletion (drives in ADD
2N/A * state are deleted later in this routine).
2N/A */
2N/A if (other_dd == NULL) {
2N/A dd->dd_flags = MD_DR_ADD;
2N/A }
2N/A dd = dd->dd_next;
2N/A }
2N/A
2N/A }
2N/A mnsr_node = mnsr_node->mmn_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Drive check completed for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A dd = master_dd;
2N/A dd_prev = 0;
2N/A while (dd) {
2N/A /* Remove any ADD drives from list */
2N/A if (dd->dd_flags & MD_DR_ADD) {
2N/A if (dd_prev) {
2N/A dd_prev->dd_next = dd->dd_next;
2N/A dd->dd_next = NULL;
2N/A metafreedrivedesc(&dd);
2N/A dd = dd_prev->dd_next;
2N/A } else {
2N/A /*
2N/A * If removing drive descriptor from head
2N/A * of linked list, also change sd->sd_drvs.
2N/A */
2N/A master_dd = sd->sd_drvs = dd->dd_next;
2N/A dd->dd_next = NULL;
2N/A metafreedrivedesc(&dd);
2N/A dd = master_dd;
2N/A }
2N/A /* dd setup in if/else above */
2N/A continue;
2N/A }
2N/A /*
2N/A * If drive is marked DEL, check all other nodes.
2N/A * If drive on another node is marked OK, mark drive OK
2N/A * in master list. If drive is marked DEL or doesn't exist
2N/A * on all nodes, remove drive from list.
2N/A */
2N/A if (dd->dd_flags & MD_DR_DEL) {
2N/A mnsr_node = master_mnsr_node;
2N/A while (mnsr_node) {
2N/A if (mnsr_node->mmn_dd == NULL) {
2N/A if (clnt_getdrivedesc(
2N/A mnsr_node->mmn_nodename, sp,
2N/A &other_dd, ep)) {
2N/A /* RPC failure to !my node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (strcmp(mynode(),
2N/A mnsr_node->mmn_nodename)
2N/A != 0)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A mde_perror(ep,
2N/A dgettext(TEXT_DOMAIN,
2N/A "Master node %s unable "
2N/A "to retrieve drive list "
2N/A "from node %s"), mynode(),
2N/A mnsr_node->mmn_nodename);
2N/A goto out;
2N/A }
2N/A mnsr_node->mmn_dd = other_dd;
2N/A }
2N/A other_dd = mnsr_node->mmn_dd;
2N/A while (other_dd) {
2N/A /* Found drive (OK) from other node */
2N/A if (strcmp(dd->dd_dnp->cname,
2N/A other_dd->dd_dnp->cname)
2N/A == 0) {
2N/A /* Drive marked OK */
2N/A if (other_dd->dd_flags &
2N/A MD_DR_OK) {
2N/A dd->dd_flags = MD_DR_OK;
2N/A }
2N/A break;
2N/A }
2N/A other_dd = other_dd->dd_next;
2N/A }
2N/A if (dd->dd_flags == MD_DR_OK)
2N/A break;
2N/A
2N/A mnsr_node = mnsr_node->mmn_next;
2N/A }
2N/A /*
2N/A * If no node had this drive marked OK, delete it.
2N/A */
2N/A if (dd->dd_flags & MD_DR_DEL) {
2N/A if (dd_prev) {
2N/A dd_prev->dd_next = dd->dd_next;
2N/A dd->dd_next = NULL;
2N/A metafreedrivedesc(&dd);
2N/A dd = dd_prev->dd_next;
2N/A } else {
2N/A /*
2N/A * If removing drive descriptor from
2N/A * head of linked list, also change
2N/A * sd->sd_drvs.
2N/A */
2N/A master_dd = sd->sd_drvs = dd->dd_next;
2N/A dd->dd_next = NULL;
2N/A metafreedrivedesc(&dd);
2N/A dd = master_dd;
2N/A }
2N/A /* dd setup in if/else above */
2N/A continue;
2N/A }
2N/A }
2N/A dd_prev = dd;
2N/A dd = dd->dd_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Setting drive states completed for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/Asend_drive_list:
2N/A /*
2N/A * Set genid on all drives to be the highest value seen.
2N/A */
2N/A dd = master_dd;
2N/A while (dd) {
2N/A dd->dd_genid = max_genid;
2N/A dd = dd->dd_next;
2N/A }
2N/A /*
2N/A * Send updated drive list to all alive nodes.
2N/A * Will also set genid on set and node records to have same
2N/A * as the drive records.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive nodes */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Sent drive list to all nodes for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * If no drive records left in set and nodes had been joined,
2N/A * withdraw the nodes. Always reset the master and mark
2N/A * all nodes as withdrawn on all nodes.
2N/A */
2N/A if (master_dd == NULL) {
2N/A /* Reset new master flag since no longer master */
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
2N/A sf.sf_flags = MDDB_NM_RESET;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A /* Ignore failure, failure to reset flag isn't catastrophic */
2N/A (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
2N/A &sf.sf_mde, NULL);
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Reset new master flag for " "set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive nodes */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A
2N/A if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A set_locked = 1;
2N/A
2N/A /* Withdraw node from set if owner */
2N/A if ((nd->nd_flags & MD_MN_NODE_OWN) &&
2N/A (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A
2N/A /* Mark all nodes as withdrawn on this node */
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2N/A sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A
2N/A /* Resets master to no-master on this node */
2N/A if (clnt_mnsetmaster(nd->nd_nodename, sp,
2N/A "", MD_MN_INVALID_NID, ep)) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
2N/A /* RPC failure to another node */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A rval = 205;
2N/A } else {
2N/A /* Any other failure */
2N/A rval = -1;
2N/A }
2N/A goto out;
2N/A }
2N/A set_locked = 0;
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/Aout:
2N/A /*
2N/A * If got here and set is still locked, then an error has
2N/A * occurred and master_nodelist is still valid.
2N/A * If error is not an RPC error, then unlock.
2N/A * If error is an RPC error, skip unlocks since this could cause
2N/A * yet another RPC timeout if a node has failed.
2N/A * Ignore failures in unlock since unlock is just trying to
2N/A * clean things up.
2N/A */
2N/A if ((set_locked) && !(mdanyrpcerror(ep))) {
2N/A nd = master_nodelist;
2N/A cl_sk = cl_get_setkey(sp->setno, sp->setname);
2N/A while (nd) {
2N/A /* Skip non-alive nodes */
2N/A if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A /*
2N/A * If clnt_unlock fails, just break out since next
2N/A * reconfig cycle will reset the locks anyway.
2N/A */
2N/A if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A cl_set_setkey(NULL);
2N/A }
2N/A /* Free master_mnsr and drive descs */
2N/A mnsr_node = master_mnsr_node;
2N/A while (mnsr_node) {
2N/A master_mnsr_node = mnsr_node->mmn_next;
2N/A free_sr((md_set_record *)mnsr_node->mmn_mnsr);
2N/A free_rem_dd(mnsr_node->mmn_dd);
2N/A Free(mnsr_node);
2N/A mnsr_node = master_mnsr_node;
2N/A }
2N/A
2N/A /* Frees sd->sd_drvs (which is also master_dd) */
2N/A metaflushsetname(sp);
2N/A return (rval);
2N/A}
2N/A
2N/A/*
2N/A * meta_mnsync_diskset_mddbs
2N/A * Calling node is guaranteed to be an owner node.
2N/A * Calling node is the master node.
2N/A *
2N/A * Master node verifies that ondisk mddb format matches its incore format.
2N/A * If no nodes are joined to set, remove the change log entries.
2N/A * If a node is joined to set, play the change log.
2N/A *
2N/A * Returns 0 - Success
2N/A * 1 - Master unable to join to set.
2N/A * 205 - Failure during RPC to another node
2N/A * -1 - Any other failure and ep is filled in.
2N/A * -1 return will eventually cause node to panic
2N/A * in a SunCluster environment.
2N/A */
2N/Aint
2N/Ameta_mnsync_diskset_mddbs(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A mddb_config_t c;
2N/A md_mn_msgclass_t class;
2N/A mddb_setflags_config_t sf;
2N/A md_mnnode_desc *nd, *nd2;
2N/A md_error_t xep = mdnullerror;
2N/A int stale_set = 0;
2N/A md_drive_desc *dd;
2N/A
2N/A /* If setname is there, set desc should exist. */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to get set %s desc information"), sp->setname);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Are there drives in the set? */
2N/A if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep)) == NULL) {
2N/A if (! mdisok(ep)) {
2N/A return (-1);
2N/A }
2N/A /* No drives in set -- nothing to sync up */
2N/A return (0);
2N/A }
2N/A
2N/A /*
2N/A * Is master node (which is this node) joined to set?
2N/A * If master node isn't joined (which means that no nodes
2N/A * are joined to diskset), remove the change log entries
2N/A * since no need to replay them - all nodes will have same
2N/A * view of mddbs since all nodes are reading in the mddbs
2N/A * from disk.
2N/A * There is also no need to sync up the master and ondisk mddbs
2N/A * since master has no incore knowledge.
2N/A * Need to join master to set in order to flush the change
2N/A * log entries. Don't need to block I/O during join of master
2N/A * to set since no other nodes are joined to set and so no I/O
2N/A * can be occurring.
2N/A */
2N/A if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2N/A /* Join master to set */
2N/A if (clnt_joinset(mynode(), sp,
2N/A MNSET_IN_RECONFIG, ep)) {
2N/A if (mdismddberror(ep, MDE_DB_STALE)) {
2N/A /*
2N/A * If STALE, print message and continue on.
2N/A * Don't do any writes or reads to mddbs
2N/A * so don't clear change log.
2N/A */
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Join of master node to STALE set %s"),
2N/A sp->setname);
2N/A stale_set = 1;
2N/A mdclrerror(ep);
2N/A } else if (mdismddberror(ep, MDE_DB_ACCOK)) {
2N/A /* ACCOK means mediator provided extra vote */
2N/A mdclrerror(ep);
2N/A } else {
2N/A /*
2N/A * If master is unable to join set, print an
2N/A * error message. Don't return failure or node
2N/A * will panic during cluster reconfig cycle.
2N/A * Also, withdraw node from set in order to
2N/A * cleanup from failed join attempt.
2N/A */
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Join of master node in set %s failed"),
2N/A sp->setname);
2N/A if (clnt_withdrawset(mynode(), sp, &xep))
2N/A mdclrerror(&xep);
2N/A return (1);
2N/A }
2N/A }
2N/A /*
2N/A * Master node successfully joined.
2N/A * Set local copy of flags to OWN and
2N/A * send owner flag to rpc.metad. If not stale,
2N/A * flush the change log.
2N/A */
2N/A sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2N/A if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
2N/A MNSET_IN_RECONFIG, ep)) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Flag update of master node join in set %s failed"),
2N/A sp->setname);
2N/A return (-1);
2N/A }
2N/A
2N/A if (!stale_set) {
2N/A if (mdmn_reset_changelog(sp, ep,
2N/A MDMN_CLF_RESETLOG) != 0) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to reset changelog."));
2N/A return (-1);
2N/A }
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Removed changelog entries for set %s: %s"),
2N/A sp->setname,
2N/A meta_print_hrtime(gethrtime() - start_time));
2N/A }
2N/A /* Reset new master flag before return */
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
2N/A sf.sf_flags = MDDB_NM_RESET;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A /* Ignore failure, failure to reset flag isn't catastrophic */
2N/A (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
2N/A &sf.sf_mde, NULL);
2N/A
2N/A /*
2N/A * The following call will solve a problem that has
2N/A * been present but harmless in the post S10 development
2N/A * release. The set creation times stored in the master
2N/A * blocks of MN sets all match except for the first one,
2N/A * which was set to the epoch. If the mbs don't all have
2N/A * matching set create times, metaimport reports bogus
2N/A * conflicts by matching on the epoch.
2N/A */
2N/A (void) meta_update_mb(sp, dd, ep);
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Reset new master flag for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A return (0);
2N/A }
2N/A
2N/A /*
2N/A * Is master already joined to STALE set (< 50% mddbs avail)?
2N/A * If so, can make no config changes to mddbs so don't check or play
2N/A * changelog and don't sync master node to ondisk mddbs.
2N/A * To get out of the stale state all nodes must be withdrawn
2N/A * from set. Then as nodes are re-joined, all nodes will
2N/A * have same view of mddbs since all nodes are reading the
2N/A * mddbs from disk.
2N/A */
2N/A (void) memset(&c, 0, sizeof (c));
2N/A c.c_id = 0;
2N/A c.c_setno = sp->setno;
2N/A if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2N/A (void) mdstealerror(ep, &c.c_mde);
2N/A return (-1);
2N/A }
2N/A if (c.c_flags & MDDB_C_STALE) {
2N/A return (0);
2N/A }
2N/A
2N/A /*
2N/A * If this node is NOT a newly chosen master, then there's
2N/A * nothing else to do since the change log should be empty and
2N/A * the ondisk and incore mddbs are already consistent.
2N/A *
2N/A * A newly chosen master is a node that was not the master
2N/A * at the beginning of the reconfig cycle. If a node is a new
2N/A * master, then the new master state is reset after the ondisk
2N/A * and incore mddbs are consistent and the change log has
2N/A * been replayed.
2N/A */
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_flags = MDDB_NM_GET;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
2N/A ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
2N/A return (0);
2N/A }
2N/A
2N/A /*
2N/A * Now, sync up incore master view to ondisk mddbs.
2N/A * This is needed in the case where a master node
2N/A * had made a change to the mddb, but this change
2N/A * may not have been relayed to the slaves yet.
2N/A * So, the new master needs to verify that the ondisk
2N/A * mddbs match what the new master has incore -
2N/A * if different, new master rewrites all of the mddbs.
2N/A * Then the new master will replay the changelog and the
2N/A * new master will then execute what the old master had
2N/A * done.
2N/A *
2N/A * Block all I/Os to disks in this diskset on all nodes in
2N/A * the diskset. This will allow the rewriting of the mddbs
2N/A * (if needed), to proceed in a timely manner.
2N/A *
2N/A * If block of I/Os fail, return a -1.
2N/A */
2N/A
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive and non-owner nodes */
2N/A if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
2N/A MN_SUSP_IO, ep)) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to suspend I/O on node %s in set %s"),
2N/A nd->nd_nodename, sp->setname);
2N/A
2N/A /*
2N/A * Resume all other nodes that had been suspended.
2N/A * (Reconfig return step also resumes I/Os
2N/A * for all sets.)
2N/A */
2N/A nd2 = sd->sd_nodelist;
2N/A while (nd2) {
2N/A /* Stop when reaching failed node */
2N/A if (nd2->nd_nodeid == nd->nd_nodeid)
2N/A break;
2N/A /* Skip non-alive and non-owner nodes */
2N/A if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd2 = nd2->nd_next;
2N/A continue;
2N/A }
2N/A (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
2N/A sp->setno, MN_RES_IO, &xep));
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * If an RPC failure on another node, return a 205.
2N/A * Otherwise, exit with failure.
2N/A */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A return (-1);
2N/A }
2N/A
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A (void) memset(&c, 0, sizeof (c));
2N/A c.c_id = 0;
2N/A c.c_setno = sp->setno;
2N/A /* Master can't sync up to ondisk mddbs? Kick it out of cluster */
2N/A if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
2N/A return (-1);
2N/A
2N/A /*
2N/A * Resume I/Os that were suspended above.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive and non-owner nodes */
2N/A if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
2N/A MN_RES_IO, ep)) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to resume I/O on node %s in set %s"),
2N/A nd->nd_nodename, sp->setname);
2N/A
2N/A /*
2N/A * If an RPC failure then don't do any
2N/A * more RPC calls, since one timeout is enough
2N/A * to endure. If RPC failure to another node, return
2N/A * 205. If RPC failure to my node, return -1.
2N/A * If not an RPC failure, continue resuming the
2N/A * rest of the nodes and then return -1.
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A if (sd->sd_mn_mynode->nd_nodeid ==
2N/A nd->nd_nodeid) {
2N/A return (-1);
2N/A } else {
2N/A return (205);
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * If not an RPC error, continue resuming rest of
2N/A * nodes, ignoring any failures except for an
2N/A * RPC failure which constitutes an immediate exit.
2N/A * Start in middle of list with failing node.
2N/A */
2N/A nd2 = nd->nd_next;
2N/A while (nd2) {
2N/A /* Skip non-alive and non-owner nodes */
2N/A if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd2 = nd2->nd_next;
2N/A continue;
2N/A }
2N/A (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
2N/A sp->setno, MN_RES_IO, &xep));
2N/A if (mdanyrpcerror(&xep)) {
2N/A return (-1);
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
2N/A "checking/writing the mddb for set %s: %s"), sp->setname,
2N/A meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * Send (aka replay) all messages we find in the changelog.
2N/A * Flag the messages with
2N/A * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
2N/A * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
2N/A */
2N/A for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
2N/A mdmn_changelog_record_t *lr;
2N/A md_error_t xep = mdnullerror;
2N/A md_mn_result_t *resultp = NULL;
2N/A int ret;
2N/A
2N/A lr = mdmn_get_changelogrec(sp->setno, class);
2N/A if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
2N/A /* no entry for this class */
2N/A continue;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
2N/A "replaying message ID=(%d, 0x%llx-%d)\n"),
2N/A MSGID_ELEMS(lr->lr_msg.msg_msgid));
2N/A
2N/A ret = mdmn_send_message_with_msgid(
2N/A lr->lr_msg.msg_setno,
2N/A lr->lr_msg.msg_type,
2N/A lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
2N/A MD_MSGF_OVERRIDE_SUSPEND,
2N/A lr->lr_msg.msg_recipient,
2N/A lr->lr_msg.msg_event_data,
2N/A lr->lr_msg.msg_event_size,
2N/A &resultp,
2N/A &lr->lr_msg.msg_msgid,
2N/A &xep);
2N/A
2N/A meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
2N/A "mdmn_send_message returned %d\n"), ret);
2N/A
2N/A if (resultp)
2N/A free_result(resultp);
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Playing changelog completed for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A /*
2N/A * Now that new master has ondisk and incore mddbs in sync, reset
2N/A * this node's new master kernel flag (for this set). If this node
2N/A * re-enters another reconfig cycle before the completion of this
2N/A * reconfig cycle, this master node won't need to check if the ondisk
2N/A * and incore mddbs are in sync since this node won't be considered
2N/A * a new master (since this flag is being reset here in the middle of
2N/A * step2). This will save time during any subsequent reconfig
2N/A * cycles as long as this node continues to be master.
2N/A */
2N/A (void) memset(&sf, 0, sizeof (sf));
2N/A sf.sf_setno = sp->setno;
2N/A sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
2N/A sf.sf_flags = MDDB_NM_RESET;
2N/A /* Use magic to help protect ioctl against attack. */
2N/A sf.sf_magic = MDDB_SETFLAGS_MAGIC;
2N/A /* Ignore failure, since failure to reset flag isn't catastrophic */
2N/A (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Reset new master flag for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A return (0);
2N/A}
2N/A
2N/A/*
2N/A * meta_mnjoin_all will join all starting nodes in the diskset.
2N/A * A starting node is considered to be any node that is not
2N/A * an owner of the set but is a member of the cluster.
2N/A * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
2N/A *
2N/A * Caller is the Master node.
2N/A *
2N/A * Returns 0 - Success
2N/A * 205 - Failure during RPC to another node
2N/A * -1 - Any other failure and ep is filled in.
2N/A */
2N/Aint
2N/Ameta_mnjoin_all(
2N/A mdsetname_t *sp,
2N/A md_error_t *ep
2N/A)
2N/A{
2N/A md_set_desc *sd;
2N/A md_mnnode_desc *nd, *nd2;
2N/A int rval = 0;
2N/A int stale_flag = 0;
2N/A mddb_config_t c;
2N/A int susp_res_flag = 0;
2N/A md_error_t xep = mdnullerror;
2N/A
2N/A /* If setname is there, set desc should exist. */
2N/A if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to get set %s desc information"), sp->setname);
2N/A return (-1);
2N/A }
2N/A
2N/A /* Are there drives in the set? */
2N/A if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2N/A ep) == NULL) {
2N/A if (! mdisok(ep)) {
2N/A return (-1);
2N/A }
2N/A /* No drives in set -- nothing to join */
2N/A return (0);
2N/A }
2N/A
2N/A /*
2N/A * Is set currently stale?
2N/A */
2N/A (void) memset(&c, 0, sizeof (c));
2N/A c.c_id = 0;
2N/A c.c_setno = sp->setno;
2N/A /* Ignore failure since master node may not be joined yet */
2N/A (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
2N/A if (c.c_flags & MDDB_C_STALE) {
2N/A stale_flag = MNSET_IS_STALE;
2N/A }
2N/A
2N/A /*
2N/A * If any nodes are going to be joined to diskset, then
2N/A * suspend I/O to all disks in diskset so that nodes can join
2N/A * (read in mddbs) in a reasonable amount of time even under
2N/A * high I/O load. Don't need to do this if set is STALE since
2N/A * no I/O can be occurring to a STALE set.
2N/A */
2N/A if (stale_flag != MNSET_IS_STALE) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Found a node that will be joined to diskset */
2N/A if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A /* Set flag that diskset should be suspended */
2N/A susp_res_flag = 1;
2N/A break;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/A if (susp_res_flag) {
2N/A /*
2N/A * Block all I/Os to disks in this diskset on all joined
2N/A * nodes in the diskset.
2N/A * If block of I/Os fails due to an RPC failure on another
2N/A * node, return 205; otherwise, return -1.
2N/A */
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /* Skip non-alive and non-owner nodes */
2N/A if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
2N/A MN_SUSP_IO, ep)) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to suspend I/O on node %s"
2N/A " in set %s"), nd->nd_nodename,
2N/A sp->setname);
2N/A /*
2N/A * Resume other nodes that had been suspended.
2N/A * (Reconfig return step also resumes I/Os
2N/A * for all sets.)
2N/A */
2N/A nd2 = sd->sd_nodelist;
2N/A while (nd2) {
2N/A /* Stop when reaching failed node */
2N/A if (nd2->nd_nodeid == nd->nd_nodeid)
2N/A break;
2N/A /* Skip non-alive/non-owner nodes */
2N/A if ((!(nd2->nd_flags &
2N/A MD_MN_NODE_ALIVE)) ||
2N/A (!(nd2->nd_flags &
2N/A MD_MN_NODE_OWN))) {
2N/A nd2 = nd2->nd_next;
2N/A continue;
2N/A }
2N/A (void) (clnt_mn_susp_res_io(
2N/A nd2->nd_nodename, sp->setno,
2N/A MN_RES_IO, &xep));
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A
2N/A /*
2N/A * If the suspend failed due to an
2N/A * RPC failure on another node, return
2N/A * a 205.
2N/A * Otherwise, exit with failure.
2N/A * The return reconfig step will resume
2N/A * I/Os for all disksets.
2N/A */
2N/A if ((mdanyrpcerror(ep)) &&
2N/A (sd->sd_mn_mynode->nd_nodeid !=
2N/A nd->nd_nodeid)) {
2N/A return (205);
2N/A } else {
2N/A return (-1);
2N/A }
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /*
2N/A * If a node is in the membership list but isn't joined
2N/A * to the set, try to join the node.
2N/A */
2N/A if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A if (clnt_joinset(nd->nd_nodename, sp,
2N/A (MNSET_IN_RECONFIG | stale_flag), ep)) {
2N/A /*
2N/A * If RPC failure to another node
2N/A * then exit without attempting anything else.
2N/A * (Reconfig return step will resume I/Os
2N/A * for all sets.)
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A mde_perror(ep, "");
2N/A return (205);
2N/A }
2N/A /*
2N/A * STALE and ACCOK failures aren't true
2N/A * failures. STALE means that <50% mddbs
2N/A * are available. ACCOK means that the
2N/A * mediator provided the extra vote.
2N/A * If a true failure, then print messasge
2N/A * and withdraw node from set in order to
2N/A * cleanup from failed join attempt.
2N/A */
2N/A if ((!mdismddberror(ep, MDE_DB_STALE)) &&
2N/A (!mdismddberror(ep, MDE_DB_ACCOK))) {
2N/A mde_perror(ep,
2N/A "WARNING: Unable to join node %s "
2N/A "to set %s", nd->nd_nodename,
2N/A sp->setname);
2N/A mdclrerror(ep);
2N/A if (clnt_withdrawset(nd->nd_nodename,
2N/A sp, &xep))
2N/A mdclrerror(&xep);
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A }
2N/A /* Set owner flag even if STALE or ACCOK */
2N/A nd->nd_flags |= MD_MN_NODE_OWN;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A /*
2N/A * Resume I/Os if suspended above.
2N/A */
2N/A if (susp_res_flag) {
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A /*
2N/A * Skip non-alive and non-owner nodes
2N/A * (this list doesn't include any of
2N/A * the nodes that were joined).
2N/A */
2N/A if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
2N/A (!(nd->nd_flags & MD_MN_NODE_OWN))) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
2N/A MN_RES_IO, ep)) {
2N/A mde_perror(ep, dgettext(TEXT_DOMAIN,
2N/A "Unable to resume I/O on node %s"
2N/A " in set %s"), nd->nd_nodename,
2N/A sp->setname);
2N/A
2N/A /*
2N/A * If an RPC failure then don't do any
2N/A * more RPC calls, since one timeout is enough
2N/A * to endure. If RPC failure to another node,
2N/A * return 205. If RPC failure to my node,
2N/A * return -1.
2N/A * (Reconfig return step will resume I/Os
2N/A * for all sets.)
2N/A * If not an RPC failure, continue resuming the
2N/A * rest of the nodes and then return -1.
2N/A */
2N/A if (mdanyrpcerror(ep)) {
2N/A if (sd->sd_mn_mynode->nd_nodeid ==
2N/A nd->nd_nodeid) {
2N/A return (-1);
2N/A } else {
2N/A return (205);
2N/A }
2N/A }
2N/A
2N/A /*
2N/A * If not an RPC error, continue resuming rest
2N/A * of nodes, ignoring any failures except for
2N/A * an RPC failure which constitutes an
2N/A * immediate exit.
2N/A * Start in middle of list with failing node.
2N/A */
2N/A nd2 = nd->nd_next;
2N/A while (nd2) {
2N/A /* Skip non-owner nodes */
2N/A if ((!(nd2->nd_flags &
2N/A MD_MN_NODE_ALIVE)) ||
2N/A (!(nd2->nd_flags &
2N/A MD_MN_NODE_OWN))) {
2N/A nd2 = nd2->nd_next;
2N/A continue;
2N/A }
2N/A (void) (clnt_mn_susp_res_io(
2N/A nd2->nd_nodename, sp->setno,
2N/A MN_RES_IO, &xep));
2N/A if (mdanyrpcerror(&xep)) {
2N/A return (-1);
2N/A }
2N/A nd2 = nd2->nd_next;
2N/A }
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A }
2N/A
2N/A nd = sd->sd_nodelist;
2N/A while (nd) {
2N/A if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2N/A nd = nd->nd_next;
2N/A continue;
2N/A }
2N/A /*
2N/A * If 1 node fails - go ahead and update the rest except
2N/A * in the case of an RPC failure, fail immediately.
2N/A */
2N/A if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2N/A sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
2N/A /* RPC failure to another node */
2N/A if (mdanyrpcerror(ep)) {
2N/A return (205);
2N/A }
2N/A nd = nd->nd_next;
2N/A rval = -1;
2N/A continue;
2N/A }
2N/A nd = nd->nd_next;
2N/A }
2N/A
2N/A meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
2N/A "Join of all nodes completed for set %s: %s"),
2N/A sp->setname, meta_print_hrtime(gethrtime() - start_time));
2N/A
2N/A return (rval);
2N/A}