/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Metadevice diskset interfaces
*/
#include <meta.h>
#include <mdmn_changelog.h>
#include "meta_set_prv.h"
#include "meta_repartition.h"
/*
 * Run checkdrive_onnode() for every drive in dnlp against every node of
 * the set sp.  For a multinode set only nodes flagged MD_MN_NODE_ALIVE are
 * checked; for a traditional set every non-empty slot of sd_nodes is
 * checked.
 *
 * Returns 0 when every check passes, -1 when the set descriptor cannot be
 * obtained or as soon as one check reports a non-zero result.
 */
static int
check_setnodes_againstdrivelist(
	mdsetname_t		*sp,
	mddrivenamelist_t	*dnlp,
	md_error_t		*ep
)
{
	md_set_desc		*setdesc;
	md_mnnode_desc		*node;
	mddrivenamelist_t	*drv;
	int			side;

	setdesc = metaget_setdesc(sp, ep);
	if (setdesc == NULL)
		return (-1);

	if (!(MD_MNSET_DESC(setdesc))) {
		/* Traditional diskset: walk the fixed-size host table. */
		for (side = 0; side < MD_MAXSIDES; side++) {
			/* Skip empty slots */
			if (setdesc->sd_nodes[side][0] == '\0')
				continue;

			for (drv = dnlp; drv != NULL; drv = drv->next) {
				if (checkdrive_onnode(sp, drv->drivenamep,
				    setdesc->sd_nodes[side], ep) != 0)
					return (-1);
			}
		}
		return (0);
	}

	/* Multinode diskset: walk the node list, ignoring dead nodes. */
	for (node = setdesc->sd_nodelist; node != NULL;
	    node = node->nd_next) {
		if ((node->nd_flags & MD_MN_NODE_ALIVE) == 0)
			continue;

		for (drv = dnlp; drv != NULL; drv = drv->next) {
			if (checkdrive_onnode(sp, drv->drivenamep,
			    node->nd_nodename, ep) != 0)
				return (-1);
		}
	}

	return (0);
}
/*
 * Verify that no drive name (cname) appears more than once in dnlp.
 *
 * Returns 0 when all names are unique.  On the first duplicate found the
 * result of mddserror(MDE_DS_DUPDRIVE) for that name is returned.
 */
static int
drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
{
	mddrivenamelist_t	*outer;
	mddrivenamelist_t	*inner;

	for (outer = dnlp; outer != NULL; outer = outer->next) {
		mddrivename_t	*candidate = outer->drivenamep;

		/* Only entries after 'outer' need to be compared. */
		for (inner = outer->next; inner != NULL;
		    inner = inner->next) {
			mddrivename_t	*other = inner->drivenamep;

			if (strcmp(candidate->cname, other->cname) == 0) {
				return (mddserror(ep, MDE_DS_DUPDRIVE,
				    sp->setno, NULL, candidate->cname,
				    sp->setname));
			}
		}
	}

	return (0);
}
/*
 * Build a drive descriptor list from the drive name list dnlp.  Each
 * descriptor is appended with metadrivedesc_append() using the set's
 * sd_ctime and sd_genid and the caller supplied flags.
 *
 * Returns the head of the new list, or NULL when the set descriptor
 * cannot be obtained (or when dnlp is empty).
 */
static md_drive_desc *
metaget_drivedesc_fromdrivelist(
	mdsetname_t		*sp,
	mddrivenamelist_t	*dnlp,
	uint_t			flags,
	md_error_t		*ep
)
{
	md_set_desc		*setdesc;
	md_drive_desc		*head = NULL;
	mddrivenamelist_t	*cur = dnlp;

	setdesc = metaget_setdesc(sp, ep);
	if (setdesc == NULL)
		return (NULL);

	while (cur != NULL) {
		(void) metadrivedesc_append(&head, cur->drivenamep, 0, 0,
		    setdesc->sd_ctime, setdesc->sd_genid, flags);
		cur = cur->next;
	}

	return (head);
}
/*
* Exported Entry Points
*/
/*
 * (Re)build the per-side name list hanging off dnp->side_names.
 *
 * In the normal path the block device name of the drive's replica slice
 * is used as the lookup name; for a partial import (import_flag != 0) the
 * name comes from midp->mid_devname instead.  The existing side names are
 * flushed and one entry is appended for every side reported by
 * meta_getnextside_devinfo().
 *
 * Returns 0 once meta_getnextside_devinfo() reports no more sides, -1 on
 * error.  During a partial import a lookup failure is not fatal: the
 * error is cleared and the driver name and minor number are taken from
 * midp.
 */
int
meta_make_sidenmlist(
	mdsetname_t		*sp,
	mddrivename_t		*dnp,
	int			import_flag,	/* flags partial import */
	md_im_drive_info_t	*midp,	/* import drive information */
	md_error_t		*ep
)
{
	mdsidenames_t	*entry;
	mdsidenames_t	**tailp;
	side_t		side = MD_SIDEWILD;
	char		*devname;
	int		rc;

	if (import_flag) {
		/*
		 * Partial import: the needed information comes from
		 * somewhere other than the system.
		 */
		dnp->side_names_key = MD_KEYWILD;
		devname = Strdup(midp->mid_devname);
	} else {
		/* Normal (aka NOT partial import) code path. */
		uint_t		slice;
		mdname_t	*slicenp;

		if (meta_replicaslice(dnp, &slice, ep) != 0)
			return (-1);

		dnp->side_names_key = MD_KEYWILD;

		slicenp = metaslicename(dnp, slice, ep);
		if (slicenp == NULL)
			return (-1);

		devname = Strdup(slicenp->bname);
	}

	metaflushsidenames(dnp);
	tailp = &dnp->side_names;

	for (;;) {
		entry = Zalloc(sizeof (*entry));

		rc = meta_getnextside_devinfo(sp, devname, &side,
		    &entry->cname, &entry->dname, &entry->mnum, ep);

		/*
		 * Finished (0), or failed (-1) outside of an import:
		 * discard the scratch entry and hand the code back.
		 */
		if (rc == 0 || (rc == -1 && !import_flag)) {
			Free(entry);
			Free(devname);
			return (rc);
		}

		if (rc == -1) {
			/* Import: fall back to the recorded drive info. */
			mdclrerror(ep);
			entry->dname = Strdup(midp->mid_driver_name);
			entry->mnum = midp->mid_mnum;
		}

		entry->sideno = side;

		/* Add to the end of the linked list */
		assert(*tailp == NULL);
		*tailp = entry;
		tailp = &entry->next;
	}
	/*NOTREACHED*/
}
/*
* Add the drives in dnlp to the diskset sp.
*
* Arguments:
*	sp		- diskset the drives are added to
*	dnlp		- list of drives to add
*	dbsize		- passed through to meta_db_balance()
*	force_label	- when TRUE, MD_REPART_FORCE is passed to
*			  meta_repartition_drive()
*	ep		- error return
*
* Outline:
*   1. Checks: set ownership, (MN set) every node ALIVE, no duplicate drive
*      names in dnlp, checkdrive_onnode() for every drive on every node,
*      and no drive already in any set.
*   2. Preparation: repartition every drive and build its side names,
*      build the drive descriptor list, fetch the set timeouts.
*   3. Commit: add the drive records on every node (rb_level 1), take
*      ownership of the drives when neither MD_MNSET_DESC() nor
*      MD_ATSET_DESC() is true (rb_level 2), balance the replicas
*      (rb_level 3), write dummy master blocks, and finally mark the drive
*      records MD_DR_OK on every node.
*
* Error handling: failures during the checks and preparation jump to
* `out`, which only reinits/resumes rpc.mdcommd, unlocks the set and frees
* local state.  Most failures from the point where the current drive
* descriptors are fetched jump to `rollback`, which undoes the completed
* steps according to rb_level before unlocking.
*
* Returns -1 on failure with the error in ep.  Otherwise returns rval,
* which by then holds the value returned by meta_db_balance()
* (presumably 0 on success - TODO confirm).
*/
int
meta_set_adddrives(
mdsetname_t *sp,
mddrivenamelist_t *dnlp,
daddr_t dbsize,
int force_label,
md_error_t *ep
)
{
md_set_desc *sd;
md_drive_desc *dd = NULL, *curdd = NULL, *ddp;
int i;
mddrivenamelist_t *p;
mhd_mhiargs_t mhiargs;
int rval = 0;
md_timeval32_t now;
sigset_t oldsigs;
ulong_t genid;
ulong_t max_genid = 0;
md_setkey_t *cl_sk;
int rb_level = 0;
md_error_t xep = mdnullerror;
md_mnnode_desc *nd;
int suspendall_flag = 0;
int suspend1_flag = 0;
int lock_flag = 0;
int flush_set_onerr = 0;
md_replicalist_t *rlp = NULL, *rl;
if ((sd = metaget_setdesc(sp, ep)) == NULL)
return (-1);
/* Make sure we own the set */
if (meta_check_ownership(sp, ep) != 0)
return (-1);
/*
* The drive and node records are stored in the local mddbs of each
* node in the diskset. Each node's rpc.metad daemon reads in the set,
* drive and node records from that node's local mddb and caches them
* internally. Any process needing diskset information contacts its
* local rpc.metad to get this information. Since each node in the
* diskset is independently reading the set information from its local
* mddb, the set, drive and node records in the local mddbs must stay
* in-sync, so that all nodes have a consistent view of the diskset.
*
* For a multinode diskset, explicitly verify that all nodes in the
* diskset are ALIVE (i.e. are in the API membership list). Otherwise,
* fail this operation since all nodes must be ALIVE in order to add
* the new drive record to their local mddb. If a panic of this node
* leaves the local mddbs set, node and drive records out-of-sync, the
* reconfig cycle will fix the local mddbs and force them back into
* synchronization.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
while (nd) {
if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
sp->setno,
nd->nd_nodename, NULL, sp->setname);
return (-1);
}
nd = nd->nd_next;
}
}
if (drvsuniq(sp, dnlp, ep) == -1)
return (-1);
/*
* Lock the set on current set members.
* Set locking done much earlier for MN diskset than for traditional
* diskset since lock_set and SUSPEND are used to protect against
* other meta* commands running on the other nodes.
*/
if (MD_MNSET_DESC(sd)) {
/* Make sure we are blocking all signals */
if (procsigs(TRUE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
rval = -1;
goto out;
}
lock_flag = 1;
nd = nd->nd_next;
}
/*
* Lock out other meta* commands by suspending
* class 1 messages across the diskset.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename,
COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
MD_MSCF_NO_FLAGS, ep)) {
rval = -1;
goto out;
}
suspend1_flag = 1;
nd = nd->nd_next;
}
}
if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
rval = -1;
goto out;
}
/* Reject any drive that already belongs to a diskset. */
for (p = dnlp; p != NULL; p = p->next) {
mdsetname_t *tmp;
if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
ep) == -1) {
rval = -1;
goto out;
}
if (tmp != NULL) {
(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
tmp->setname, p->drivenamep->cname, sp->setname);
rval = -1;
goto out;
}
}
/* END CHECK CODE */
/*
* This is a separate loop (from above) so that we validate all the
* drives handed to us before we repartition any one drive.
*/
for (p = dnlp; p != NULL; p = p->next) {
if (meta_repartition_drive(sp,
p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
NULL, /* Don't return the VTOC. */
ep) != 0) {
rval = -1;
goto out;
}
/*
* Create the names for the drives we are adding per side.
*/
if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
ep) == -1) {
rval = -1;
goto out;
}
}
/*
* Get the list of drives descriptors that we are adding.
*/
dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
if (! mdisok(ep)) {
rval = -1;
goto out;
}
/*
* Get the set timeout information.
*/
(void) memset(&mhiargs, '\0', sizeof (mhiargs));
if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
rval = -1;
goto out;
}
/*
* Get timestamp and generation id for new records
*/
now = sd->sd_ctime;
genid = sd->sd_genid;
/* At this point, in case of error, set should be flushed. */
flush_set_onerr = 1;
/* Lock the set on current set members */
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_on();
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
rval = -1;
goto out;
}
lock_flag = 1;
}
}
/*
* Get drive descriptors for the drives that are currently in the set.
*/
curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
if (! mdisok(ep))
goto rollback;
/*
* If first drive being added to set, set the mastership
* of the multinode diskset to be this node.
* Only set it on this node. If all goes well
* and there are no errors, the mastership of this node will be set
* on all nodes in user space and in the kernel.
*/
if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
if (clnt_mnsetmaster(mynode(), sp,
sd->sd_mn_mynode->nd_nodename,
sd->sd_mn_mynode->nd_nodeid, ep)) {
goto rollback;
}
/*
* Set this up in my local cache of the set desc so that
* the set descriptor won't have to be gotten again from
* rpc.metad. If it is flushed and gotten again, these
* values will be set in sr2setdesc.
*/
sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
(void) strcpy(sd->sd_mn_master_nodenm,
sd->sd_mn_mynode->nd_nodename);
sd->sd_mn_am_i_master = 1;
}
RB_TEST(1, "adddrives", ep)
RB_PREEMPT;
rb_level = 1; /* level 1 */
RB_TEST(2, "adddrives", ep)
/*
* Add the drive records for the drives that we are adding to
* each host in the set. Marks the drive as MD_DR_ADD.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
ep) == -1)
goto rollback;
RB_TEST(3, "adddrives", ep)
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
ep) == -1)
goto rollback;
RB_TEST(3, "adddrives", ep)
}
}
RB_TEST(4, "adddrives", ep)
RB_PREEMPT;
rb_level = 2; /* level 2 */
RB_TEST(5, "adddrives", ep)
/*
* Take ownership of the added drives.
*/
if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
goto rollback;
}
/*
* If this is not a MN set and the state flags do not indicate the
* presence of devids, update the set records on all nodes.
*/
if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
if (meta_update_mb(sp, dd, ep) == 0) {
mdclrerror(ep);
/* update the sr_flags on all hosts */
for (i = 0; i < MD_MAXSIDES; i++) {
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_upd_sr_flags(sd->sd_nodes[i],
sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
goto rollback;
}
}
}
RB_TEST(6, "adddrives", ep)
RB_PREEMPT;
rb_level = 3; /* level 3 */
RB_TEST(7, "adddrives", ep)
/*
* Balance the DB's according to the list of existing drives and the
* list of added drives.
*/
/* Note: rval now carries meta_db_balance()'s result to the return. */
if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
goto rollback;
/*
* Slam a dummy master block on all the disks that we are adding
* that don't have replicas on them.
* Used by diskset import if the disksets are remotely replicated
*/
/*
* NOTE(review): rlp is not released anywhere in this function, and a
* failure of metareplicalist() here is neither reported nor cleared
* from ep - confirm both are intended.
*/
if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
uint_t rep_slice;
int fd = -1;
mdname_t *np = NULL;
char *drive_name;
drive_name = ddp->dd_dnp->cname;
for (rl = rlp; rl != NULL; rl = rl->rl_next) {
char *rep_name;
rep_name =
rl->rl_repp->r_namep->drivenamep->cname;
if (strcmp(drive_name, rep_name) == 0) {
/*
* Disk has a replica on it so don't
* add dummy master block.
*/
break;
}
}
if (rl == NULL) {
/*
* Drive doesn't have a replica on it so
* we need a dummy master block. Add it.
*/
if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
&xep) != 0) {
mdclrerror(&xep);
continue;
}
if ((np = metaslicename(ddp->dd_dnp, rep_slice,
&xep)) == NULL) {
mdclrerror(&xep);
continue;
}
/* Best effort: an open() failure is silently ignored. */
if ((fd = open(np->rname, O_RDWR)) >= 0) {
meta_mkdummymaster(sp, fd, 16);
(void) close(fd);
}
}
}
}
if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Start by suspending rpc.mdcommd (which drains it of all
* messages), then change the nodelist followed by a reinit
* and resume.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* NOTE(review): this failure takes the `out` path, not
* `rollback`, although rb_level is already 3 - confirm
* that skipping the rollback steps here is intended.
*/
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
rval = -1;
goto out;
}
suspendall_flag = 1;
nd = nd->nd_next;
}
}
/*
* If a MN diskset and this is the first disk(s) being added
* to set, then pre-allocate change log records here.
* When the other nodes are joined into the MN diskset, the
* USER records will just be snarfed in.
*/
if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
if (mdmn_allocate_changelog(sp, ep) != 0)
goto rollback;
}
/*
* Mark the drives MD_DR_OK.
* If first drive being added to MN diskset, then set
* master on all nodes to be this node and then join
* all alive nodes (nodes in membership list) to set.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* don't set master on this node - done earlier */
if ((curdd == NULL) && (nd->nd_nodeid !=
sd->sd_mn_mynode->nd_nodeid)) {
/*
* Set master on all alive nodes since
* all alive nodes will become joined nodes.
*/
if (clnt_mnsetmaster(nd->nd_nodename, sp,
sd->sd_mn_mynode->nd_nodename,
sd->sd_mn_mynode->nd_nodeid, ep)) {
goto rollback;
}
}
if (curdd == NULL) {
/*
* No special flags for join set. Since
* all nodes are joining if 1st drive is being
* added to set then all nodes will be either
* STALE or non-STALE and each node can
* determine this on its own.
*/
if (clnt_joinset(nd->nd_nodename, sp,
NULL, ep)) {
goto rollback;
}
/* Sets join node flag on all nodes in list */
if (clnt_upd_nr_flags(nd->nd_nodename, sp,
sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
goto rollback;
}
}
/*
* Set MD_DR_OK as last thing before unlock.
* In case of panic on this node, recovery
* code can check for MD_DR_OK to determine
* status of diskset.
*/
if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
MD_DR_OK, ep) == -1)
goto rollback;
RB_TEST(8, "adddrives", ep)
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
ep) == -1)
goto rollback;
RB_TEST(8, "adddrives", ep)
}
}
RB_TEST(9, "adddrives", ep)
/*
* Common exit for success and for failures that need no rollback:
* reinit/resume rpc.mdcommd, unlock the set, free local state.
*/
out:
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Send reinit command to mdcommd which forces it to get
* fresh set description.
*/
if (suspendall_flag) {
/* Send reinit */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Class is ignored for REINIT */
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
/* Keep the first error; later ones are only printed. */
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
mde_perror(ep, dgettext(TEXT_DOMAIN,
"Unable to reinit rpc.mdcommd.\n"));
}
nd = nd->nd_next;
}
}
/*
* Unlock diskset by resuming messages across the diskset.
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if ((suspend1_flag) || (suspendall_flag)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
mde_perror(ep, dgettext(TEXT_DOMAIN,
"Unable to resume rpc.mdcommd.\n"));
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
if (lock_flag) {
cl_sk = cl_get_setkey(sp->setno, sp->setname);
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_unlock_set(nd->nd_nodename,
cl_sk, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
}
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_unlock_set(sd->sd_nodes[i],
cl_sk, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
}
}
}
cl_set_setkey(NULL);
}
metafreedrivedesc(&dd);
if (flush_set_onerr) {
metaflushsetname(sp);
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_off(md_got_sig(), md_which_sig());
}
}
if (MD_MNSET_DESC(sd)) {
/* release signals back to what they were on entry */
if (procsigs(FALSE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
}
return (rval);
/*
* Failure exit: undo the completed steps, highest rb_level first.
* Errors hit while rolling back go to xep and are cleared, so ep keeps
* the error that caused the rollback.
*/
rollback:
/* all signals already blocked for MN diskset */
if (!(MD_MNSET_DESC(sd))) {
/* Make sure we are blocking all signals */
if (procsigs(TRUE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
}
rval = -1;
max_genid = sd->sd_genid;
/* level 3 */
if (rb_level > 2) {
/*
* Since the add drive operation is failing, need
* to reset config back to the way it was
* before the add drive operation.
* If a MN diskset and this is the first drive being added,
* then reset master on all ALIVE nodes (which is all nodes)
* since the master would have not been set previously.
* Don't reset master on this node, since this
* is done later.
* This is ok to fail since next node to add first
* disk to diskset will also set the master on all nodes.
*
* Also, if this is the first drive being added,
* need to have each node withdraw itself from the set.
*/
if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Be careful with ordering in case of
* panic between the steps and the
* effect on recovery during reconfig.
*/
if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
mdclrerror(&xep);
/* Sets withdraw flag on all nodes in list */
if (clnt_upd_nr_flags(nd->nd_nodename, sp,
sd->sd_nodelist, MD_NR_WITHDRAW,
NULL, &xep)) {
mdclrerror(&xep);
}
/* Skip this node */
if (nd->nd_nodeid ==
sd->sd_mn_mynode->nd_nodeid) {
nd = nd->nd_next;
continue;
}
/* Reset master on all of the other nodes. */
if (clnt_mnsetmaster(nd->nd_nodename, sp,
"", MD_MN_INVALID_NID, &xep))
mdclrerror(&xep);
nd = nd->nd_next;
}
}
}
/*
* Send resume command to mdcommd. Don't send reinit command
* since nodelist should not have changed.
* If suspendall_flag is set, then user would have been adding
* first drives to set. Since this failed, there is certainly
* no reinit message to send to rpc.commd since no nodes will
* be joined to set at the end of this metaset command.
*/
if (suspendall_flag) {
/* Send resume */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Resume all classes but class 1 so that lock is held
* against meta* commands.
* To later resume class1, must issue a class0 resume.
*/
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0,
MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
mde_perror(&xep, dgettext(TEXT_DOMAIN,
"Unable to resume rpc.mdcommd.\n"));
mdclrerror(&xep);
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
/* level 3 */
if (rb_level > 2) {
mdnamelist_t *nlp;
mdname_t *np;
/* Detach any replica on the replica slice of each added drive. */
for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
uint_t rep_slice;
if ((meta_replicaslice(ddp->dd_dnp,
&rep_slice, &xep) != 0) ||
((np = metaslicename(ddp->dd_dnp, rep_slice,
&xep)) == NULL)) {
mdclrerror(&xep);
continue;
}
nlp = NULL;
(void) metanamelist_append(&nlp, np);
if (meta_db_detach(sp, nlp,
(MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
mdclrerror(&xep);
metafreenamelist(nlp);
}
/* Re-balance */
if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
mdclrerror(&xep);
/* Only if we are adding the first drive */
/* Handled MN diskset above. */
if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
if (clnt_stimeout(mynode(), sp, &defmhiargs,
&xep) == -1)
mdclrerror(&xep);
/* This is needed because of a corner case */
if (halt_set(sp, &xep))
mdclrerror(&xep);
}
max_genid++;
}
/* level 2 */
if (rb_level > 1) {
if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
if (rel_own_bydd(sp, dd, TRUE, &xep))
mdclrerror(&xep);
}
}
/* level 1 */
if (rb_level > 0) {
/* Remove the drive records that were added on every node. */
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_deldrvs(nd->nd_nodename, sp, dd,
&xep) == -1)
mdclrerror(&xep);
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
&xep) == -1)
mdclrerror(&xep);
}
}
max_genid += 2;
resync_genid(sp, sd, max_genid, 0, NULL);
}
if ((suspend1_flag) || (suspendall_flag)) {
/* Send resume */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Just resume all classes so that resume is the
* same whether just one class was locked or all
* classes were locked.
*/
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
mdclrerror(&xep);
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
/* level 0 */
cl_sk = cl_get_setkey(sp->setno, sp->setname);
/* Don't test lock flag since guaranteed to be set if in rollback */
if (MD_MNSET_DESC(sd)) {
/*
* Since the add drive operation is failing, need
* to reset config back to the way it was
* before the add drive operation.
* If a MN diskset and this is the first drive being
* added, then reset master on this node since
* the master would have not been set previously.
* This is ok to fail since next node to add first
* disk to diskset will also set the master on all nodes.
*/
if (curdd == NULL) {
/* Reset master on mynode */
if (clnt_mnsetmaster(mynode(), sp, "",
MD_MN_INVALID_NID, &xep))
mdclrerror(&xep);
}
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
mdclrerror(&xep);
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
mdclrerror(&xep);
}
}
cl_set_setkey(NULL);
/* release signals back to what they were on entry */
if (procsigs(FALSE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
metafreedrivedesc(&dd);
if (flush_set_onerr) {
metaflushsetname(sp);
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_off(md_got_sig(), md_which_sig());
}
}
return (rval);
}
/*
 * Add drives routine used during import of a diskset.
 *
 * Arguments:
 *	sp	- set the drives are added to
 *	dnlp	- list of drives to add
 *	misp	- import set description; mis_partial, mis_drives and
 *		  mis_flags are consulted here
 *	ep	- error return
 *
 * For every drive in dnlp the per-side names are built.  For a partial
 * import (misp->mis_partial) the matching import drive information is
 * looked up in misp->mis_drives and handed to meta_make_sidenmlist(); a
 * drive with no matching entry fails the import with MDE_DS_SETNOTIMP.
 * The drive descriptors are then annotated with the devid and replica
 * count/size recorded in misp, the drive records are added on this node
 * with clnt_imp_adddrvs() and ownership of the drives is taken.
 *
 * Returns 0 on success and -1 on failure with the error in ep.  Failures
 * from clnt_imp_adddrvs() onwards are rolled back: ownership is released
 * (rb_level > 1, unless MD_ATSET_DESC()) and the drive records are
 * deleted again (rb_level > 0).
 */
int
meta_imp_set_adddrives(
	mdsetname_t		*sp,
	mddrivenamelist_t	*dnlp,
	md_im_set_desc_t	*misp,
	md_error_t		*ep
)
{
	md_set_desc		*sd;
	mddrivenamelist_t	*p;
	md_drive_desc		*dd = NULL, *ddp;
	int			flush_set_onerr = 0;
	md_timeval32_t		now;
	ulong_t			genid;
	mhd_mhiargs_t		mhiargs;
	md_im_replica_info_t	*mirp;
	/*
	 * Must start out NULL: for a non-partial import nothing below
	 * assigns midp before it is passed to meta_make_sidenmlist(), and
	 * passing an uninitialized pointer is undefined behavior.
	 */
	md_im_drive_info_t	*midp = NULL;
	int			rval = 0;
	sigset_t		oldsigs;
	ulong_t			max_genid = 0;
	int			rb_level = 0;
	md_error_t		xep = mdnullerror;

	if ((sd = metaget_setdesc(sp, ep)) == NULL)
		return (-1);

	for (p = dnlp; p != NULL; p = p->next) {
		int	imp_flag = 0;

		/*
		 * If we have a partial diskset, meta_make_sidenmlist will
		 * need information from midp to complete making the
		 * side name structure.
		 */
		if (misp->mis_partial) {
			imp_flag = MDDB_C_IMPORT;
			for (midp = misp->mis_drives; midp != NULL;
			    midp = midp->mid_next) {
				if (midp->mid_dnp == p->drivenamep)
					break;
			}
			if (midp == NULL) {
				(void) mddserror(ep, MDE_DS_SETNOTIMP,
				    MD_SET_BAD, mynode(), NULL, sp->setname);
				rval = -1;
				goto out;
			}
		}

		/*
		 * Create the names for the drives we are adding per side.
		 * midp is NULL here unless this is a partial import; it is
		 * only dereferenced by meta_make_sidenmlist() when imp_flag
		 * is non-zero.
		 */
		if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
		    midp, ep) == -1) {
			rval = -1;
			goto out;
		}
	}

	/*
	 * Get the list of drives descriptors that we are adding.
	 */
	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);

	if (! mdisok(ep)) {
		rval = -1;
		goto out;
	}

	/*
	 * Get the set timeout information.
	 */
	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
		rval = -1;
		goto out;
	}

	/*
	 * Get timestamp and generation id for new records
	 */
	now = sd->sd_ctime;
	genid = sd->sd_genid;

	/* At this point, in case of error, set should be flushed. */
	flush_set_onerr = 1;

	rb_level = 1;	/* level 1 */

	/*
	 * Copy devid and replica information from the import drive list
	 * into the matching drive descriptors.
	 */
	for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
			if (ddp->dd_dnp == midp->mid_dnp) {
				/* same disk */
				ddp->dd_dnp->devid =
				    devid_str_encode(midp->mid_devid,
				    midp->mid_minor_name);

				/*
				 * dd_dbcnt is the number of replicas listed
				 * for the drive; dd_dbsize is taken from the
				 * first one.
				 */
				ddp->dd_dbcnt = 0;
				mirp = midp->mid_replicas;
				if (mirp) {
					ddp->dd_dbsize = mirp->mir_length;
					for (; mirp != NULL;
					    mirp = mirp->mir_next) {
						ddp->dd_dbcnt++;
					}
				}
				if ((midp->mid_available &
				    MD_IM_DISK_NOT_AVAILABLE) &&
				    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
					ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
				}
			}
		}
	}

	/*
	 * Add the drive records for the drives that we are adding to
	 * each host in the set.  Marks the drive records as MD_DR_ADD.
	 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
	 * this flag was set in the dd_flags for that drive.
	 */
	if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
		goto rollback;

	rb_level = 2;	/* level 2 */

	/*
	 * Take ownership of the added drives.
	 */
	if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
		goto rollback;

out:
	metafreedrivedesc(&dd);

	if (flush_set_onerr) {
		metaflushsetname(sp);
	}

	return (rval);

rollback:
	/* Make sure we are blocking all signals */
	if (procsigs(TRUE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	rval = -1;

	max_genid = sd->sd_genid;

	/* level 2 */
	if (rb_level > 1) {
		if (!MD_ATSET_DESC(sd)) {
			if (rel_own_bydd(sp, dd, TRUE, &xep)) {
				mdclrerror(&xep);
			}
		}
	}

	/* level 1 */
	if (rb_level > 0) {
		if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
			mdclrerror(&xep);
		}
		max_genid += 2;
		resync_genid(sp, sd, max_genid, 0, NULL);
	}

	/* level 0 */

	/* release signals back to what they were on entry */
	if (procsigs(FALSE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	metafreedrivedesc(&dd);

	if (flush_set_onerr) {
		metaflushsetname(sp);
		/*
		 * NOTE(review): md_rb_sig_handling_on() is never called in
		 * this function - confirm this call is intended.
		 */
		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
	}

	return (rval);
}
int
meta_set_deletedrives(
mdsetname_t *sp,
mddrivenamelist_t *dnlp,
int forceflg,
md_error_t *ep
)
{
md_set_desc *sd;
md_drive_desc *ddp, *dd = NULL, *curdd = NULL;
md_replicalist_t *rlp = NULL, *rl;
mddrivenamelist_t *p;
int deldrvcnt = 0;
int rval = 0;
mhd_mhiargs_t mhiargs;
int i;
sigset_t oldsigs;
md_setkey_t *cl_sk;
ulong_t max_genid = 0;
int rb_level = 0;
md_error_t xep = mdnullerror;
md_mnnode_desc *nd;
int has_set;
int current_drv_cnt = 0;
int suspendall_flag = 0, suspendall_flag_rb = 0;
int suspend1_flag = 0;
int lock_flag = 0;
bool_t stale_bool = FALSE;
int flush_set_onerr = 0;
mdnamelist_t *nlp;
mdname_t *np;
if ((sd = metaget_setdesc(sp, ep)) == NULL)
return (-1);
/* Make sure we own the set */
if (meta_check_ownership(sp, ep) != 0)
return (-1);
if (drvsuniq(sp, dnlp, ep) == -1)
return (-1);
/*
* Check and see if all the nodes have the set.
*
* The drive and node records are stored in the local mddbs of each
* node in the diskset. Each node's rpc.metad daemon reads in the set,
* drive and node records from that node's local mddb and caches them
* internally. Any process needing diskset information contacts its
* local rpc.metad to get this information. Since each node in the
* diskset is independently reading the set information from its local
* mddb, the set, drive and node records in the local mddbs must stay
* in-sync, so that all nodes have a consistent view of the diskset.
*
* For a multinode diskset, explicitly verify that all nodes in the
* diskset are ALIVE (i.e. are in the API membership list). Otherwise,
* fail this operation since all nodes must be ALIVE in order to delete
* a drive record from their local mddb. If a panic of this node
* leaves the local mddbs set, node and drive records out-of-sync, the
* reconfig cycle will fix the local mddbs and force them back into
* synchronization.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
while (nd) {
if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
sp->setno,
nd->nd_nodename, NULL, sp->setname);
return (-1);
}
nd = nd->nd_next;
}
/* Make sure we are blocking all signals */
if (procsigs(TRUE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
/*
* Lock the set on current set members.
* Set locking done much earlier for MN diskset than for
* traditional diskset since lock_set and SUSPEND are used
* to protect against other meta* commands running on the
* other nodes.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
rval = -1;
goto out;
}
lock_flag = 1;
nd = nd->nd_next;
}
/*
* Lock out other meta* commands by suspending
* class 1 messages across the diskset.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename,
COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
MD_MSCF_NO_FLAGS, ep)) {
rval = -1;
goto out;
}
suspend1_flag = 1;
nd = nd->nd_next;
}
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (strcmp(nd->nd_nodename, mynode()) == 0) {
nd = nd->nd_next;
continue;
}
has_set = nodehasset(sp, nd->nd_nodename,
NHS_NSTG_EQ, ep);
if (has_set < 0) {
rval = -1;
goto out;
}
if (! has_set) {
(void) mddserror(ep, MDE_DS_NODENOSET,
sp->setno, nd->nd_nodename,
NULL, sp->setname);
rval = -1;
goto out;
}
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (strcmp(sd->sd_nodes[i], mynode()) == 0)
continue;
has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
ep);
if (has_set < 0) {
/*
* Can directly return since !MN diskset;
* nothing to unlock.
*/
return (-1);
}
if (! has_set) {
/*
* Can directly return since !MN diskset;
* nothing to unlock.
*/
return (mddserror(ep, MDE_DS_NODENOSET,
sp->setno, sd->sd_nodes[i], NULL,
sp->setname));
}
}
}
for (p = dnlp; p != NULL; p = p->next) {
int is_it;
mddrivename_t *dnp;
dnp = p->drivenamep;
if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
== -1) {
rval = -1;
goto out;
}
if (! is_it) {
(void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
NULL, dnp->cname, sp->setname);
rval = -1;
goto out;
}
if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
rval = -1;
goto out;
}
deldrvcnt++;
}
current_drv_cnt = deldrvcnt;
/*
* Get drive descriptors for the drives that are currently in the set.
*/
curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
if (! mdisok(ep)) {
rval = -1;
goto out;
}
/*
* Decrement the delete drive count for each drive currently in the
* set.
*/
for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
deldrvcnt--;
/*
* If the count of drives we are deleting is equal to the drives in the
* set, and we haven't specified forceflg, return an error
*/
if (deldrvcnt == 0 && forceflg == FALSE) {
(void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
rval = -1;
goto out;
}
/*
* Get the list of drive descriptors that we are deleting.
*/
dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
if (! mdisok(ep)) {
rval = -1;
goto out;
}
/*
* Get the set timeout information in case we have to roll back.
*/
(void) memset(&mhiargs, '\0', sizeof (mhiargs));
if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
rval = -1;
goto out;
}
/* At this point, in case of error, set should be flushed. */
flush_set_onerr = 1;
/* END CHECK CODE */
/* Lock the set on current set members */
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_on();
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
rval = -1;
goto out;
}
lock_flag = 1;
}
}
if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
mddb_config_t c;
/*
* Is current set STALE?
*/
(void) memset(&c, 0, sizeof (c));
c.c_id = 0;
c.c_setno = sp->setno;
if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
(void) mdstealerror(ep, &c.c_mde);
rval = -1;
goto out;
}
if (c.c_flags & MDDB_C_STALE) {
stale_bool = TRUE;
}
}
RB_TEST(1, "deletedrives", ep)
RB_PREEMPT;
rb_level = 1; /* level 1 */
RB_TEST(2, "deletedrives", ep)
/*
* Mark the drives MD_DR_DEL
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
MD_DR_DEL, ep) == -1)
goto rollback;
RB_TEST(3, "deletedrives", ep)
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
MD_DR_DEL, ep) == -1)
goto rollback;
RB_TEST(3, "deletedrives", ep)
}
}
RB_TEST(4, "deletedrives", ep)
RB_PREEMPT;
rb_level = 2; /* level 2 */
RB_TEST(5, "deletedrives", ep)
/*
* Balance the DB's according to the list of existing drives and the
* list of deleted drives.
*/
if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
goto rollback;
/*
* If the drive(s) to be deleted cannot be accessed,
* they haven't really been deleted yet. Check and delete now
* if need be.
*/
if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
nlp = NULL;
for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
char *delete_name;
delete_name = ddp->dd_dnp->cname;
for (rl = rlp; rl != NULL; rl = rl->rl_next) {
char *cur_name;
cur_name =
rl->rl_repp->r_namep->drivenamep->cname;
if (strcmp(delete_name, cur_name) == 0) {
/* put it on the delete list */
np = rl->rl_repp->r_namep;
(void) metanamelist_append(&nlp, np);
}
}
}
if (nlp != NULL) {
if (meta_db_detach(sp, nlp,
(MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
ep) == -1) {
metafreenamelist(nlp);
goto rollback;
}
metafreenamelist(nlp);
}
}
RB_TEST(6, "deletedrives", ep)
RB_PREEMPT;
rb_level = 3; /* level 3 */
RB_TEST(7, "deletedrives", ep)
/*
* Cannot suspend set until after meta_db_balance since
* meta_db_balance uses META_DB_ATTACH/DETACH messages.
*/
if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Start by suspending rpc.mdcommd (which drains it of all
* messages), then change the nodelist followed by a reinit
* and resume.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
rval = -1;
goto out;
}
suspendall_flag = 1;
nd = nd->nd_next;
}
}
/*
* Remove the drive records for the drives that were deleted from
* each host in the set. This removes the record and dr_flags.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
goto rollback;
RB_TEST(8, "deletedrives", ep)
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
goto rollback;
RB_TEST(8, "deletedrives", ep)
}
}
RB_TEST(9, "deletedrives", ep)
RB_PREEMPT;
rb_level = 4; /* level 4 */
RB_TEST(10, "deletedrives", ep)
if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
if (rel_own_bydd(sp, dd, TRUE, ep))
goto rollback;
}
/* If we deleted all the drives, then we need to halt the set. */
if (deldrvcnt == 0) {
RB_TEST(11, "deletedrives", ep)
RB_PREEMPT;
rb_level = 5; /* level 5 */
RB_TEST(12, "deletedrives", ep)
if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
goto rollback;
RB_TEST(13, "deletedrives", ep)
RB_PREEMPT;
rb_level = 6; /* level 6 */
RB_TEST(14, "deletedrives", ep)
/* Halt MN diskset on all nodes by having node withdraw */
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Only withdraw nodes that are joined */
if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
nd = nd->nd_next;
continue;
}
/*
* Going to set locally cached node flags to
* rollback join so in case of error, the
* rollback code knows which nodes to re-join.
*/
nd->nd_flags |= MD_MN_NODE_RB_JOIN;
/*
* Be careful in ordering of following steps
* so that recovery from a panic between
* the steps is viable.
* Only reset master info in rpc.metad -
* don't reset local cached information
* which will be used to set master information
* back in case of failure (rollback).
*/
if (clnt_withdrawset(nd->nd_nodename, sp, ep))
goto rollback;
/* Sets withdraw flag on all nodes in list */
if (clnt_upd_nr_flags(nd->nd_nodename, sp,
sd->sd_nodelist, MD_NR_WITHDRAW,
NULL, ep)) {
goto rollback;
}
if (clnt_mnsetmaster(nd->nd_nodename, sp,
"", MD_MN_INVALID_NID, ep)) {
goto rollback;
}
nd = nd->nd_next;
}
} else {
if (halt_set(sp, ep))
goto rollback;
}
RB_TEST(15, "deletedrives", ep)
}
RB_TEST(16, "deletedrives", ep)
out:
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Send reinit command to mdcommd which forces it to get
* fresh set description.
*/
if (suspendall_flag) {
/* Send reinit */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Class is ignored for REINIT */
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
mde_perror(ep, dgettext(TEXT_DOMAIN,
"Unable to reinit rpc.mdcommd.\n"));
}
nd = nd->nd_next;
}
}
/*
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if ((suspend1_flag) || (suspendall_flag)) {
/* Send resume */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
mde_perror(ep, dgettext(TEXT_DOMAIN,
"Unable to resume rpc.mdcommd.\n"));
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
if (lock_flag) {
cl_sk = cl_get_setkey(sp->setno, sp->setname);
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_unlock_set(nd->nd_nodename,
cl_sk, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
}
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_unlock_set(sd->sd_nodes[i],
cl_sk, &xep)) {
if (rval == 0)
(void) mdstealerror(ep, &xep);
rval = -1;
}
}
}
cl_set_setkey(NULL);
}
metafreedrivedesc(&dd);
if (flush_set_onerr) {
metaflushsetname(sp);
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_off(md_got_sig(), md_which_sig());
}
}
if (MD_MNSET_DESC(sd)) {
/* release signals back to what they were on entry */
if (procsigs(FALSE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
}
return (rval);
rollback:
/* All signals are already blocked for an MN diskset */
if (!(MD_MNSET_DESC(sd))) {
/* Make sure we are blocking all signals */
if (procsigs(TRUE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
}
rval = -1;
max_genid = sd->sd_genid;
/* Set the master on all nodes first thing */
if (rb_level > 5) {
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
continue;
}
/*
* Set master on all re-joining nodes to be
* my cached view of master.
*/
if (clnt_mnsetmaster(nd->nd_nodename, sp,
sd->sd_mn_master_nodenm,
sd->sd_mn_master_nodeid, &xep)) {
mdclrerror(&xep);
}
}
}
}
/* level 3 */
if (rb_level > 2) {
md_set_record *sr;
md_mnset_record *mnsr;
md_drive_record *dr;
int sr_drive_cnt;
/*
* See if we have to re-add the drives specified.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Must get current set record from each
* node to see what else must be done
* to recover.
* Record should be for a multi-node diskset.
*/
if (clnt_mngetset(nd->nd_nodename, sp->setname,
MD_SET_BAD, &mnsr, &xep) == -1) {
mdclrerror(&xep);
nd = nd->nd_next;
continue;
}
/*
* If all drives are already there, skip
* to next node.
*/
sr_drive_cnt = 0;
dr = mnsr->sr_drivechain;
while (dr) {
sr_drive_cnt++;
dr = dr->dr_next;
}
if (sr_drive_cnt == current_drv_cnt) {
free_sr((md_set_record *)mnsr);
nd = nd->nd_next;
continue;
}
/* Re-add all drives */
if (clnt_adddrvs(nd->nd_nodename, sp, dd,
mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
mdclrerror(&xep);
free_sr((struct md_set_record *)mnsr);
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
/* Record should be for a non-multi-node set */
if (clnt_getset(sd->sd_nodes[i], sp->setname,
MD_SET_BAD, &sr, &xep) == -1) {
mdclrerror(&xep);
continue;
}
/*
* Set record structure was allocated from RPC
* routine getset so this structure is only of
* size md_set_record even if the MN flag is
* set. So, clear the flag so that the free
* code doesn't attempt to free a structure
* the size of md_mnset_record.
*/
if (MD_MNSET_REC(sr)) {
sr->sr_flags &= ~MD_SR_MN;
free_sr(sr);
continue;
}
/* Drive already added, skip to next node */
if (sr->sr_drivechain != NULL) {
free_sr(sr);
continue;
}
if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
sr->sr_ctime, sr->sr_genid, &xep) == -1)
mdclrerror(&xep);
free_sr(sr);
}
}
max_genid += 2;
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* At this point in time, don't know which nodes are joined
* to the set. So, send a reinit command to mdcommd
* which forces it to get fresh set description. Then send resume.
*
* Later, this code will use rpc.mdcommd messages to reattach disks
* and then rpc.mdcommd may be suspended again, rest of the nodes
* joined, rpc.mdcommd reinited and then resumed.
*/
if (suspendall_flag) {
/* Send reinit */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Class is ignored for REINIT */
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
mde_perror(&xep, dgettext(TEXT_DOMAIN,
"Unable to reinit rpc.mdcommd.\n"));
mdclrerror(&xep);
}
nd = nd->nd_next;
}
/* Send resume */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Resume all classes but class 1 so that lock is held
* against meta* commands.
* To later resume class1, must issue a class0 resume.
*/
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0,
MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
mde_perror(&xep, dgettext(TEXT_DOMAIN,
"Unable to resume rpc.mdcommd.\n"));
mdclrerror(&xep);
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
/* level 2 */
if (rb_level > 1) {
mdnamelist_t *nlp;
mdname_t *np;
for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
uint_t rep_slice;
if ((meta_replicaslice(ddp->dd_dnp,
&rep_slice, &xep) != 0) ||
((np = metaslicename(ddp->dd_dnp, rep_slice,
&xep)) == NULL)) {
mdclrerror(&xep);
continue;
}
nlp = NULL;
(void) metanamelist_append(&nlp, np);
if (meta_db_attach(sp, nlp,
(MDCHK_DRVINSET | MDCHK_SET_LOCKED),
&sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
NULL, &xep) == -1)
mdclrerror(&xep);
metafreenamelist(nlp);
}
/* Re-balance */
if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
mdclrerror(&xep);
}
/* level 4 */
if (rb_level > 3) {
if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
mdclrerror(&xep);
}
}
/* level 5 */
if (rb_level > 4) {
if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
mdclrerror(&xep);
}
/*
* If at least one node needs to be rejoined to MN diskset,
* then suspend commd again.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
nd = nd->nd_next;
continue;
}
break;
}
if (nd) {
/*
* Found node that will be rejoined so
* notify rpc.mdcommd on all nodes of a nodelist change.
* Start by suspending rpc.mdcommd (which drains it of
* all messages), then change the nodelist followed by
* a reinit and resume.
*/
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename,
COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
MD_MSCF_NO_FLAGS, &xep)) {
mdclrerror(&xep);
}
suspendall_flag_rb = 1;
nd = nd->nd_next;
}
}
}
/* level 6 */
if (rb_level > 5) {
if (MD_MNSET_DESC(sd)) {
int join_flags = 0;
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Only rejoin nodes that were joined before */
if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
nd = nd->nd_next;
continue;
}
/*
* Rejoin nodes to same state as before -
* either STALE or non-STALE.
*/
if (stale_bool == TRUE)
join_flags = MNSET_IS_STALE;
if (clnt_joinset(nd->nd_nodename, sp,
join_flags, &xep))
mdclrerror(&xep);
/* Sets OWN flag on all nodes in list */
if (clnt_upd_nr_flags(nd->nd_nodename, sp,
sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
mdclrerror(&xep);
}
nd = nd->nd_next;
}
} else {
if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
mdclrerror(&xep);
/* No special flag for traditional diskset */
if (snarf_set(sp, NULL, &xep))
mdclrerror(&xep);
}
}
/* level 1 */
if (rb_level > 0) {
/*
* Mark the drives as OK.
*/
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/*
* Must be last action before unlock.
* In case of panic, recovery code checks
* for MD_DR_OK to know that drive
* and possible master are fully added back.
*/
if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
MD_DR_OK, &xep) == -1)
mdclrerror(&xep);
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
MD_DR_OK, &xep) == -1)
mdclrerror(&xep);
}
}
max_genid += 2;
resync_genid(sp, sd, max_genid, 0, NULL);
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Send a reinit command to mdcommd which forces it to get
* fresh set description.
*/
if (suspendall_flag_rb) {
/* Send reinit */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
/* Class is ignored for REINIT */
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
mde_perror(&xep, dgettext(TEXT_DOMAIN,
"Unable to reinit rpc.mdcommd.\n"));
mdclrerror(&xep);
}
nd = nd->nd_next;
}
}
/*
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
/* Send resume */
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
mde_perror(&xep, dgettext(TEXT_DOMAIN,
"Unable to resume rpc.mdcommd.\n"));
mdclrerror(&xep);
}
nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
/* level 0 */
cl_sk = cl_get_setkey(sp->setno, sp->setname);
/* Don't test lock flag since guaranteed to be set if in rollback */
if (MD_MNSET_DESC(sd)) {
nd = sd->sd_nodelist;
/* All nodes are guaranteed to be ALIVE */
while (nd) {
if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
mdclrerror(&xep);
nd = nd->nd_next;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
if (sd->sd_nodes[i][0] == '\0')
continue;
if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
mdclrerror(&xep);
}
}
cl_set_setkey(NULL);
/* release signals back to what they were on entry */
if (procsigs(FALSE, &oldsigs, &xep) < 0)
mdclrerror(&xep);
metafreedrivedesc(&dd);
if (flush_set_onerr) {
metaflushsetname(sp);
if (!(MD_MNSET_DESC(sd))) {
md_rb_sig_handling_off(md_got_sig(), md_which_sig());
}
}
return (rval);
}