meta_set.c revision d7cd82522afdd890a66c7600b499590ad44e84bd
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
*/
#if !defined(TEXT_DOMAIN)
#define TEXT_DOMAIN "SYS_TEST"
#endif
/*
* Metadevice diskset interfaces
*/
#include "meta_set_prv.h"
#include <meta.h>
#include <metad.h>
#include <mdmn_changelog.h>
#include <sdssc.h>
extern char *blkname(char *);
static md_drive_desc *
int flags,
)
{
if (flags & MD_BYPASS_DAEMON) {
return (NULL);
} else {
return (NULL);
}
/*
* WARNING:
* The act of getting the dnp from the namespace means that we
* will get the devid of the disk as recorded in the namespace.
* This devid has the potential to be stale if the disk is being
* replaced via a rebind, this means that any code that relies
* on any of the dnp information should take the appropriate action
* to preserve that information. For example in the rebind code the
* devid of the new disk is saved off and then copied back in once
* the code that has called this function has completed.
*/
if (!(flags & MD_BYPASS_DAEMON))
return (NULL);
}
}
if (!(flags & MD_BYPASS_DAEMON)) {
}
return (dd_head);
}
static int
)
{
int i;
return (-1);
if (MD_MNSET_DESC(sd)) {
/*
* Only get sidenames for this node since
* that is the only side information stored in
* the local mddb for a multi-node diskset.
*/
if (sd->sd_mn_mynode) {
return (-1);
}
/* Add to the end of the linked list */
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
/*
* It is possible that during the add of a
* host to have a 'missing' side as the side
* for this disk will be added later. So ignore
* the error. The 'missing' side will be added
* once the addhosts process has completed.
*/
mdclrerror(ep);
continue;
}
return (-1);
}
/* Add to the end of the linked list */
}
}
return (0);
}
static md_drive_desc *
)
{
md_replica_t *r;
md_drive_desc *d;
int found;
return (NULL);
/* find the smallest existing replica */
}
if (nblks <= 0)
found = 0;
found = 1;
break;
}
}
if (! found)
}
return (dd);
}
/*
* Exported Entry Points
*/
{
if (max_sets == 0)
return (0);
return (max_sets);
}
int
{
static int max_meds = 0;
if (max_meds == 0)
return (0);
return (max_meds);
}
{
return (0);
return (MD_SIDEWILD);
if (sideno != MD_SIDEWILD)
return (sideno);
}
/*
* get set info from name
*/
{
char *p;
/* get set info from daemon */
return (NULL);
/*
* Returned record could be for a multi-node set or a
* non-multi-node set.
*/
if (MD_MNSET_REC(sr)) {
/*
* Record is for a multi-node set. Reissue call
* to get mnset information. Need to free
* record as if a non-multi-node set record since
* that is what clnt_getset gave us. If in
* the daemon, don't free since this is a pointer
* into the setrecords array.
*/
if (! md_in_daemon) {
}
ep) == -1)
return (NULL);
return ((struct md_set_record *)mnsr);
} else {
return (sr);
}
}
/* no such set */
Free(p);
return (NULL);
}
/*
* get set info from number
*/
{
char buf[100];
return (NULL);
/*
* Record is for a multi-node set. Reissue call
* to get mnset information. Need to free
* record as if a non-multi-node set record since
* that is what clnt_getset gave us. If in
* the daemon, don't free since this is a pointer
* into the setrecords array.
*/
if (MD_MNSET_REC(sr)) {
/*
* Record is for a multi-node set. Reissue call
* to get mnset information.
*/
if (! md_in_daemon) {
}
ep) == -1)
return (NULL);
return ((struct md_set_record *)mnsr);
} else {
return (sr);
}
}
return (NULL);
}
int
int check_db,
)
{
mdnamelist_t *p;
int rval = 0;
/* get all underlying partitions */
return (-1);
/* search for drive */
break;
}
}
/* cleanup, return success */
return (rval);
}
/*
* simple check for ownership
*/
int
{
int ownset;
if (metaislocalset(sp))
return (0);
return (-1);
return (-1);
return (-1);
/* If we have no drive descriptors, check for no ownership */
if (ownset == MD_SETOWNER_NONE)
return (0);
/* If ownership somehow has come to exist, we must clean up */
&xep) < 0)
mdclrerror(&xep);
mdclrerror(&xep);
mdclrerror(&xep);
}
mdclrerror(&xep);
return (0);
}
if (ownset == MD_SETOWNER_YES)
return (0);
}
/*
* simple check for ownership
*/
int
{
int bool;
if (metaislocalset(sp))
return (0);
return (-1);
return (-1);
return (-1);
return (0);
if (bool == TRUE)
return (0);
}
/*
* Function that determines if a node is in the multinode diskset
* membership list. Calling node passes in node to be checked and
* the nodelist as returned from meta_read_nodelist. This routine
* anticipates being called many times using the same diskset membership
* list which is why the alloc and free of the diskset membership list
* is left to the calling routine.
* Returns:
* 1 - if a member
* 0 - not a member
*/
int
char *node_name,
)
{
int flag_check_name;
if (node_id != 0)
flag_check_name = 0;
flag_check_name = 1;
else
return (0);
while (nl2) {
if (flag_check_name) {
/* Compare given name against name in member list */
break;
} else {
/* Compare given nodeid against nodeid in member list */
break;
}
}
/* No match found in member list */
return (0);
}
/* Return 1 if node is in member list */
return (1);
}
/*
* meta_getnext_devinfo should go to the host that
* has the device, to return the device name, driver name, minor num.
* We can take the big cheat for now, since it is a requirement
* that the device names and device numbers are the same, and
* just get the info locally.
*
* This routine is very similar to meta_getnextside_devinfo except
* that the specific side to be used is being passed in.
*
* Exit status:
* 0 - No more side info to return
* 1 - More side info's to return
* -1 - An error has been detected
*/
/*ARGSUSED*/
int
char *bname, /* local block name (myside) */
char **ret_bname, /* block device name of returned side */
char **ret_dname, /* driver name of returned side */
)
{
return (-1);
/*
* NOTE (future) - There will be more work here once devids are integrated
* into disksets. Then the side should be used to find the correct
* host and the b/d names should be gotten from that host.
*/
/*
* Return the side info.
*/
return (-1);
}
return (1);
}
/*
* Get the information on the device from the remote node using the devid
* of the disk.
*
* Exit status:
* 0 - No more side info to return
* 1 - More side info's to return
* -1 - An error has been detected
*/
int
char *bname, /* local block name (myside) */
char **ret_bname, /* block device name of returned side */
char **ret_dname, /* driver name of returned side */
)
{
int i;
int devidstrlen;
char *ret_devname = NULL;
char *ret_blkdevname = NULL;
char *ret_driver = NULL;
char *nodename;
int fd;
int ret = -1;
char *minor_name = NULL;
if (metaislocalset(sp)) {
/* no more sides - we are done */
if (*sideno != MD_SIDEWILD)
return (0);
/* First time through - set up return sideno */
*sideno = 0;
} else {
/*
* Find the next sideno, starting after the one given.
*/
return (-1);
if (MD_MNSET_DESC(sd)) {
if ((*sideno == MD_SIDEWILD) &&
} else {
while (nd) {
/*
* Found given sideno, now find
* next sideno, if there is one.
*/
(struct md_mnnode_desc *)NULL)) {
*sideno =
break;
}
}
return (0);
}
}
if (*sideno == MD_SIDEWILD)
return (0);
} else {
/* Find next full slot */
break;
/* No more sides - we are done */
if (i == MD_MAXSIDES)
return (0);
/* Set up the return sideno */
*sideno = i;
}
}
/*
* Need to pass the node the devid of the disk and get it to
* send back the details of the disk from that side.
*/
return (-1);
/*
* By default, set up the parameters so that they are copied out.
*/
return (-1);
}
/*
* Try some optimization. If this is the local set or the device
* is a metadevice then just copy the information. If the device
* does not have a devid (due to not having a minor name) then
* fall back to the pre-devid behaviour of copying the information
* on the device: this is okay because the sanity checks before this
* call would have found any issues with the device. If it's a
* multi-node diskset also just return ie. copy.
*/
(MD_MNSET_DESC(sd)))
return (1);
/*
* Have to get the minor name then. The slice should exist
* on the disk because it will have already been repartitioned
* up prior to getting to this point.
*/
return (-1);
}
}
/* allocate extra space for "/" and NULL hence +2 */
/*
* As a minor name is supplied then the ret_devname will be
* appropriate to that minor_name and in this case it will be
*/
/*
* If the other side is not running device id in disksets,
* 'ret' is set to ENOTSUP in which case we fallback to
* the existing behaviour
*/
return (1);
else if (ret == -1)
return (-1);
/*
* ret_devname comes from the rpc call and is a
* raw device name. We need to make this into a
* block device via blkname for further processing.
* Unfortunately, when our device id isn't found in
* the system, the rpc call will return a " " in
* ret_devname in which case we need to fill that in
* as ret_blkname because blkname of " " returns NULL.
*/
if (ret_blkdevname == NULL)
else
}
return (1);
}
int
mdsetname_t **spp,
int bypass_daemon,
)
{
int is_it;
return (-1);
if (!bypass_daemon) {
mdclrerror(ep);
return (0);
}
mdclrerror(ep);
continue;
}
return (-1);
}
} else
mdclrerror(ep);
continue;
}
return (-1);
}
if (is_it) {
return (0);
}
}
return (0);
}
int
int bypass_daemon,
)
{
md_drive_desc *dd, *p;
if (bypass_daemon)
else
return (-1);
return (0);
}
return (1);
return (0);
}
int
)
{
int i;
int rval = 0;
int suspend1_flag = 0;
return (-1);
/* Make sure we own the set */
return (-1);
/* END CHECK CODE */
/*
* Get drive descriptors for the drives that are currently in the set.
*/
return (-1);
/* Find the minimum replica size in use, or use the default */
mdclrerror(ep);
else
/* Make sure we are blocking all signals */
mdclrerror(&xep);
/*
* Lock the set on current set members.
* For MN diskset lock_set and SUSPEND are used to protect against
* other meta* commands running on the other nodes.
*/
if (MD_MNSET_DESC(sd)) {
while (nd) {
continue;
}
rval = -1;
goto out;
}
}
/*
* Lock out other meta* commands by suspending
* class 1 messages across the diskset.
*/
while (nd) {
continue;
}
MD_MSCF_NO_FLAGS, ep)) {
rval = -1;
goto out;
}
suspend1_flag = 1;
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
rval = -1;
goto out;
}
}
}
/* We are not adding or deleting any drives, just balancing */
/*
* Balance the DB's according to the list of existing drives and the
* list of added drives.
*/
goto out;
out:
/*
* Unlock diskset by resuming class 1 messages across the diskset.
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if (suspend1_flag) {
while (nd) {
continue;
}
/*
* We are here because we failed to resume
* rpc.mdcommd. However we potentially have
* an error from the previous call
* (meta_db_balance). If the previous call
* did fail, we capture that error and
* generate a perror with the string,
* "Unable to resume...".
* Setting rval to -1 ensures that in the
* next iteration of the loop, ep is not
* clobbered.
*/
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
"Unable to resume rpc.mdcommd."));
}
}
}
/* Unlock the set */
if (MD_MNSET_DESC(sd)) {
while (nd) {
continue;
}
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
}
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
if (rval == 0)
rval = -1;
}
}
}
/* release signals back to what they were on entry */
mdclrerror(&xep);
return (rval);
}
int
int lock_set,
)
{
int i;
int num_users = 0;
int has_set;
int rval = 0;
int delete_end = 1;
/* Make sure we are blocking all signals */
return (-1);
rval = -1;
goto out;
}
/*
* meta_set_destroy should not be called for a MN diskset.
* This routine destroys a set without communicating this information
* to the other nodes which would lead to an inconsistency in
* the MN diskset.
*/
if (MD_MNSET_DESC(sd)) {
rval = -1;
goto out;
}
/* Continue if a traditional diskset */
/*
* Check to see who has the set. If we are not the last user of the
* set, we will not touch the replicas.
*/
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
ep);
if (has_set < 0) {
mdclrerror(ep);
} else
num_users++;
}
rval = -1;
goto out;
}
}
rval = -1;
goto out;
}
/* Lock the set on our side */
rval = -1;
goto out;
}
}
/*
* A traditional diskset has no diskset stale information to send
* since there can only be one owner node at a time.
*/
mdclrerror(ep);
/*
* Make sure that no drives are in use as parts of metadrives
* or hot spare pools, this is one of the few error conditions
* that will stop this routine, unless the environment has
* META_DESTROY_SET_OK set, in which case, the operation will
* proceed.
*/
if (i == -1) {
/* need xep - wire calls clear error */
if (i == -1) {
rval = -1;
goto out;
}
if (mysideno == MD_SIDEWILD) {
rval = -1;
goto out;
}
rval = -1;
goto out;
}
rval = -1;
goto out;
}
}
}
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
/* Skip non local nodes */
continue;
mdclrerror(ep);
}
/*
* Go thru each drive and individually delete the replicas.
* This way we can ignore individual errors.
*/
== NULL) &&
== NULL))) {
rval = -1;
goto out;
}
rval = -1;
goto out;
}
mdclrerror(ep);
}
/* Yes this is UGLY!!! */
mdclrerror(ep);
if (p->dd_dbcnt == 0)
continue;
/*
* Skip the replica removal if we are not the last user
*/
if (num_users != 1)
continue;
mdclrerror(ep);
}
}
rval = -1;
goto out;
}
/* Setup the mediator record */
medr.med_rec_fl = 0;
medr.med_rec_foff = 0;
/*
* If we are the last remaining user, then remove the mediator hosts
*/
if (num_users == 1) {
for (i = 0; i < MED_MAX_HOSTS; i++) {
sizeof (md_h_t));
}
} else { /* Remove this host from the mediator node list. */
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
/* Copy non local node */
continue;
}
/* Clear local node */
sizeof (md_node_nm_t));
}
}
/*
* If the client is part of a cluster put the DCS service
* into a deleting state.
*/
delete_end = 0;
} else {
mdclrerror(ep);
goto out;
}
}
/* Inform the mediator hosts of the new information */
for (i = 0; i < MED_MAX_HOSTS; i++) {
continue;
mdclrerror(ep);
}
/* Delete the set locally */
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
/* Skip non local nodes */
continue;
mdclrerror(ep);
}
if (delete_end &&
rval = -1;
out:
/* release signals back to what they were on entry */
if (rval == 0)
rval = -1;
}
if (rval == 0)
rval = -1;
}
}
return (rval);
}
int
int bypass_cluster,
int forceflg,
)
{
int rval = 0;
int i, num_hosts = 0;
int has_set = 0;
int max_node = 0;
int delete_end = 1;
/* unable to find set description */
rval = 1;
return (rval);
}
if (MD_MNSET_DESC(sd)) {
/*
* Get a count of the hosts in the set and also lock the set
* on those hosts that know about it.
*/
while (nd) {
continue;
}
NHS_NST_EQ, ep);
/*
* The host is not aware of this set (has_set < 0) or
* the set does not match (has_set == 0). This check
* prevents the code getting confused by an apparent
* inconsistency in the set's state; this is the
* purge code, so something is broken in any case and
* this is just trying to fix the brokenness.
*/
if (has_set <= 0) {
mdclrerror(ep);
} else {
num_hosts++;
/*
* If the force flag is set then
* ignore any RPC failures because we
* are only really interested with
* the set on local node.
*/
mdclrerror(ep);
} else {
/*
* set max_node so that in the
* unlock code nodes in the
* set that have not been
* locked are not unlocked.
*/
rval = 2;
goto out1;
}
}
}
}
max_node = 0;
} else {
/*
* Get a count of the hosts in the set and also lock the set
* on those hosts that know about it.
*/
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
NHS_NST_EQ, ep);
/*
* The host is not aware of this set (has_set < 0) or
* the set does not match (has_set == 0). This check
* prevents the code getting confused by an apparent
* inconsistency in the set's state; this is the
* purge code, so something is broken in any case and
* this is just trying to fix the brokenness.
*/
if (has_set <= 0) {
mdclrerror(ep);
/*
* set the node to NULL to prevent further
* requests to this unresponsive node.
*/
} else {
num_hosts++;
/*
* If the force flag is set then
* ignore any RPC failures because we
* are only really interested with
* the set on local node.
*/
mdclrerror(ep);
} else {
rval = 2;
/*
* set max_node so that in the
* unlock code nodes in the
* set that have not been
* locked are not unlocked.
*/
max_node = i;
goto out1;
}
}
}
}
max_node = i; /* now MD_MAXSIDES */
}
if (!bypass_cluster) {
/*
* If there is only one host associated with the
* set then remove the set from the cluster.
*/
if (num_hosts == 1) {
delete_end = 0;
} else {
mdclrerror(ep);
rval = 3;
goto out1;
}
}
}
}
if (MD_MNSET_DESC(sd)) {
/*
* Get a count of the hosts in the set and also lock the set
* on those hosts that know about it.
*/
while (nd) {
continue;
}
/*
* Tell the remote node to remove this node
*/
/*
* If we fail to delete ourselves
* from the remote host it does not
* really matter because the set is
* being "purged" from this node. The
* set can be purged from the other
* node at a later time.
*/
mdclrerror(ep);
}
continue;
}
/* remove the set from this host */
mdclrerror(ep);
goto out1;
}
}
} else {
for (i = 0; i < MD_MAXSIDES; i++) {
/* Skip empty slots */
continue;
/*
* Tell the remote node to remove this node
*/
/*
* If we fail to delete ourselves
* from the remote host it does not
* really matter because the set is
* being "purged" from this node. The
* set can be purged from the other
* node at a later time.
*/
mdclrerror(ep);
}
continue;
}
/* remove the set from this host */
mdclrerror(ep);
goto out1;
}
}
}
SDSSC_ERROR) {
rval = 4;
}
}
out1:
/*
* Remove the set lock on those nodes that had the set locked
* max_node will either be MD_MAXSIDES or array index of the last
* node contacted (or rather failed to contact) for traditional
* diskset. For a MN diskset, max_node is the node_id of the node
* that failed the lock.
*/
if (MD_MNSET_DESC(sd)) {
while (nd) {
continue;
}
break;
mdclrerror(&xep);
continue;
}
if (rval == 0)
rval = 5;
}
}
} else {
for (i = 0; i < max_node; i++) {
/* Skip empty slots */
continue;
mdclrerror(&xep);
continue;
}
if (rval == 0)
rval = 5;
}
}
}
return (rval);
}
int
)
{
/*CONSTCOND*/
while (1) {
else
break;
/*
* Run to the end of the list
*/
/* void */;
sizeof (mddb_dtag_t));
}
return (0);
}
/*
* return the drivename looked up by key
*/
int flags,
)
{
char *nm;
return (NULL);
}
/* get namespace info */
if (MD_MNSET_DESC(sd)) {
return (NULL);
} else {
return (NULL);
}
/* get device name */
if (flags & PRINT_FAST) {
return (NULL);
}
} else {
return (NULL);
}
}
/* make sure it's OK */
return (NULL);
/* get drivename */
/*
* Skip the following devid check if dnp is a did device.
* Device ids are disabled for did devices due to the
* lack of minor name support in the did driver. The following
* devid code path can set and propagate the error and
* eventually prevent did disks from being added to the
* diskset under SunCluster systems.
*/
goto out;
}
/* Also skip the check for a MN diskset: no devids there */
if (MD_MNSET_DESC(sd)) {
goto out;
}
/*
* Get the devid associated with the key.
*
* If a devid was returned, it MUST be valid even in
* the case where a device id has been "updated". The
* "update" of the device id may have occurred due to
* a firmware upgrade.
*/
!= NULL) {
} else {
/*
* It is okay if replica is not in devid mode
*/
mdclrerror(ep);
goto out;
}
/*
* devid is missing so this means that we have
* just upgraded from a configuration where
* devid's were not used so try to add in
* the devid and requery.
*/
ep) < 0)
return (NULL);
return (NULL);
}
out:
if (flags & MD_BYPASS_DAEMON)
return (dnp);
return (NULL);
/* return success */
return (dnp);
}
void
{
Free(p);
}
}
int flags,
)
{
return (NULL);
}
)
{
mdnamelist_t *p;
return (NULL);
return (dd);
}
int flags,
)
{
return (NULL);
return (NULL);
}
int
)
{
int bool;
int i;
return (-1);
if (MD_MNSET_DESC(sd)) {
while (nd) {
/* If node isn't alive, can't own diskset */
continue;
}
/*
* If can't communicate with rpc.metad, then mark
* this node as not an owner. That node may
* in fact, be an owner, but without rpc.metad running
* that node can't do much.
*/
} else if (bool == TRUE) {
} else {
}
}
return (0);
}
/* Rest of code handles traditional disksets */
for (i = 0; i < MD_MAXSIDES; i++)
return (-1);
if (bool == TRUE)
return (0);
}
/*
* mynode
* Returns a char * (NOTE(review): presumably this host's node name; the
* statements that fill in and return the value are not visible in this
* excerpt -- confirm against the full file).
* The function-static `done` flag means the block under `if (! done)`
* is skipped once `done` has been set to 1.
*/
char *
mynode(void)
{
/* one-time-setup guard; static, so its value persists across calls */
static int done = 0;
if (! done) {
/* NOTE(review): unconditional as shown; presumably a failure path */
assert(0);
}
done = 1;
}
}
int
{
int i;
for (i = 0; i < cnt; i++)
return (TRUE);
return (FALSE);
}
/*
* meta_get_reserved_names
*
* Produces an mdnamelist_t of reserved slices (NOTE(review): presumably
* handed back through nlpp -- confirm against the full file).
* Reserved slices are those that are in use but don't necessarily
* show up as metadevices (e.g. the slice reserved for the db in sets,
* and logs).
*
* Returns the value of `count`: it starts at 0, is incremented as
* names are added, and is set to -1 on the error paths. For the local
* set the function jumps straight to `out` and so returns 0.
*/
/*ARGSUSED*/
int
mdnamelist_t **nlpp,
int options,
md_error_t *ep)
{
/* running total of names added; doubles as the error indicator (-1) */
int count = 0;
/* nothing is collected for the local set: return count as 0 */
if (metaislocalset(sp))
goto out;
count = -1;
goto out;
}
/* for sets, the db lives on a reserved slice */
/*
* Add the name struct to the end of the
* namelist but keep a pointer to the last
* element so that we don't incur the overhead
* of traversing the list each time
*/
count++;
else
count = -1;
}
/* now find logs */
count = -1;
goto out;
}
count = -1;
goto out;
}
/*
* Add the name struct to the end of the
* namelist but keep a pointer to the last
* element so that we don't incur the overhead
* of traversing the list each time
*/
}
}
/* single exit point: every path returns whatever count holds */
out:
return (count);
}
/*
* Entry point to join a node to MultiNode diskset.
*
* Validate host in diskset.
* - Should be in membership list from API
* - Should not already be joined into diskset.
* - Set must have drives
* in the local mddb since no node or drive can be added to the MNset
* unless all drives and nodes are available. Reconfig steps will
* resync all ALIVE nodes in case of panic in critical areas.
*
* Lock down the set.
* Verify host is a member of this diskset.
* If drives exist in the configuration, load the mddbs.
* Set this node to active by notifying master if one exists.
* If this is the first node active in the diskset, this node
* becomes the master.
* Unlock the set.
*
* Mirror Resync:
* If this node is the last node to join the set and clustering
* isn't running, then start the 'metasync -r' type resync
* on all mirrors in this diskset.
* If clustering is running, this resync operation will
* be handled by the reconfig steps and should NOT
* be handled during a join operation.
*
* There are multiple return values in order to assist
* the join operation of all sets in the metaset command.
*
* Return values:
* 0 - Node successfully joined to set.
* -1 - Join attempted but failed
* - any failure from libmeta calls
* - node not in the member list
* -2 - Join not attempted since
* - this set had no drives in set
* - this node already joined to set
* - set is not a multinode set
* -3 - Node joined to STALE set.
*/
extern int
)
{
int rval = 0;
int master_flag = 0;
int clear_nr_flags = 0;
int stale_set = 0;
int rb_flags = 0;
int stale_bool = FALSE;
int suspendall_flag = 0;
int suspend1_flag = 0;
int send_reinit = 0;
return (-1);
}
/* Must be a multinode diskset */
if (!MD_MNSET_DESC(sd)) {
return (-2);
}
/* Verify that the node is ALIVE (i.e. is in the API membership list) */
return (-1);
}
/* Make sure we are blocking all signals */
mdclrerror(&xep);
/*
* Lock the set on current set members.
* For MN diskset lock_set and SUSPEND are used to protect against
* other meta* commands running on the other nodes.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
}
/*
* Lock out other meta* commands by suspending
* class 1 messages across the diskset.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
suspend1_flag = 1;
}
/*
* Verify that this host is a member (in the host list) of the set.
*/
while (nd) {
break;
}
}
if (!nd) {
rval = -1;
goto out;
}
/*
* Need to return failure if host is already 'joined'
* into the set. This is done so that if later the user
* issues a command to join all sets and a failure is
* encountered - that the resulting cleanup effort
* (withdrawing from all sets that were joined
* during that command) won't withdraw from this set.
*/
rval = -2;
goto out2;
}
/*
* Call metaget_setownership that calls each node in diskset and
* marks in set descriptor if node is an owner of the set or not.
* metaget_setownership checks to see if a node is an owner by
* checking to see if that node's kernel has the mddb loaded.
* If a node had panic'd during a reconfig or an
* records may not reflect the current state of the diskset,
* so calling metaget_setownership is the safest thing to do.
*/
rval = -1;
goto out;
}
/* If first active member of diskset, become the master. */
while (nd) {
break;
}
master_flag = 1;
/*
* If not first active member of diskset, then get the
* master information from a node that is already joined
* and set the master information for this node. Be sure
* that this node (the already joined node) has its own
* join flag set. If not, then this diskset isn't currently
* consistent and shouldn't allow a node to join. This diskset
* inconsistency should only occur when a node has panic'd in
* the set while doing a metaset operation and the sysadmin is
* attempting to join a node into the set. This inconsistency
* will be fixed during a reconfig cycle which should be occurring
* soon since a node panic'd.
*
* If unable to get this information from an owning node, then
* this diskset isn't currently consistent and shouldn't
* allow a node to join.
*/
if (!master_flag) {
/* get master information from an owner (joined) node */
rval = -1;
goto out;
}
/* Verify that owner (joined) node has its own JOIN flag set */
while (nr) {
nd->nd_nodename);
rval = -1;
goto out;
}
}
/*
* Does master have set marked as STALE?
* If so, need to pass this down to kernel when
* this node snarfs the set.
*/
rval = -1;
goto out;
}
/* set master information in my rpc.metad's set record */
rval = -1;
goto out;
}
/* set master information in my cached set desc */
while (nd2) {
break;
}
}
/*
* Set the node flags in mynode's rpc.metad node records for
* the nodes that are in the diskset. Can use my sd
* since earlier call to metaget_setownership set the
* owner flags based on whether that node had snarfed
* the MN diskset mddb. Reconfig steps guarantee that
* return of metaget_setownership will match the owning
* node's owner list except in the case where a node
* has just panic'd and in this case, a reconfig will
* be starting immediately and the owner lists will
* be sync'd up by the reconfig.
*
* Flag of SET means to take no action except to
* set the node flags as given in the nodelist linked list.
*/
rval = -1;
goto out;
}
}
/*
* Read in the mddb if there are drives in the set.
*/
/* No drives in list */
rval = -1;
goto out;
}
rval = -2;
goto out;
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Start by suspending rpc.mdcommd (which drains it of all messages),
* then change the nodelist followed by a reinit and resume.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
suspendall_flag = 1;
}
/* Set master in my set record in rpc.metad */
if (master_flag) {
rval = -1;
goto out;
}
}
/*
* Causes mddbs to be loaded into the kernel.
* Set the force flag so that replica locations can be
* loaded into the kernel even if a mediator node was
* unavailable. This allows a node to join an MO
* diskset when there are sufficient replicas available,
* but a mediator node is unavailable.
*/
"Host not able to start diskset."));
rval = -1;
goto out;
}
rval = -1;
goto out;
}
/*
* Set rollback flags to 1 so that halt_set is called if a failure
* is seen after this point. If snarf_set fails, still need to
* call halt_set to cleanup the diskset.
*/
rb_flags = 1;
/* Starts the set */
/*
* Don't fail join, STALE means that set has
* < 50% mddbs.
*/
stale_set = 1;
/* If snarf failed, but no error was set - set it */
rval = -1;
goto out;
/*
* Don't fail join if ACCOK; ACCOK means that mediator
* provided extra vote.
*/
rval = -1;
goto out;
}
}
/* Did set really get snarfed? */
/* If snarf failed, but no error was set - set it */
}
"Host not able to start diskset."));
rval = -1;
goto out;
}
/* Change to nodelist so need to send reinit to rpc.mdcommd */
send_reinit = 1;
/* If first node to enter set, setup master and clear change log */
if (master_flag) {
/* Set master in my locally cached set descriptor */
/*
* If first node to join set, then clear out change log
* entries. Change log entries are only needed when a
* change of master is occurring in a diskset that has
* multiple owners. Since this node is the first owner
* of the diskset, clear the entries.
*
* Only do this if we are in a single node non-SC3.x
* situation.
*/
if (meta_mn_singlenode() &&
"Unable to reset changelog."));
rval = -1;
goto out;
}
}
/* Set my locally cached flag */
/*
* Set this node's own flag on all joined nodes in the set
* (including my node).
*/
clear_nr_flags = 1;
while (nd) {
continue;
}
rval = -1;
goto out;
}
}
out:
/*
* If rollback flag is 1, then node was joined to set.
* Since an error occurred, withdraw node from set in
* order to rollback to before command was run.
* Need to preserve ep so that calling function can
* get error information.
*/
if (rb_flags == 1) {
mdclrerror(&xep);
}
}
/*
* If error, reset master to INVALID.
* Ignore error since (next) first node to successfully join
* will set master on all nodes.
*/
MD_MN_INVALID_NID, &xep);
mdclrerror(&xep);
/* Reset master in my locally cached set descriptor */
sd->sd_mn_am_i_master = 0;
/*
* If nr flags set on other nodes, reset them.
*/
if (clear_nr_flags) {
while (nd) {
continue;
}
mdclrerror(&xep);
}
/* Reset my locally cached flag */
}
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Send reinit command to mdcommd which forces it to get
* fresh set description.
*/
if (send_reinit) {
/* Send reinit */
while (nd) {
continue;
}
/* Class is ignored for REINIT */
/*
* We are here because we failed to reinit
* rpc.mdcommd. However we potentially have
* an error from the previous call.
* If the previous call did fail, we capture
* that error and generate a perror with
* the string, "Unable to reinit...".
* Setting rval to -1 ensures that in the
* next iteration of the loop, ep is not
* clobbered.
*/
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
"Unable to reinit rpc.mdcommd."));
}
}
}
out2:
/*
* Unlock diskset by resuming messages across the diskset.
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if ((suspend1_flag) || (suspendall_flag)) {
while (nd) {
continue;
}
/*
* We are here because we failed to resume
* rpc.mdcommd. However we potentially have
* an error from the previous call.
* If the previous call did fail, we capture
* that error and generate a perror with
* the string, "Unable to resume...".
* Setting rval to -1 ensures that in the
* next iteration of the loop, ep is not
* clobbered.
*/
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
"Unable to resume rpc.mdcommd."));
}
}
}
/*
* Unlock set. This flushes the caches on the servers.
*/
while (nd) {
continue;
}
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
}
}
/*
* If this node is the last to join the diskset and clustering isn't
* running, then resync the mirrors in the diskset. We have to wait
* until all nodes are joined so that the status gets propagated to
* all of the members of the set.
* Ignore any error from the resync as the join function shouldn't fail
* because the mirror resync had a problem.
*
* Don't start resync if set is stale.
*/
(stale_set != 1)) {
while (nd) {
break;
}
/*
* nd set to NULL means that we have no nodes in the set that
* haven't joined. In this case we start the resync.
*/
mdclrerror(&xep);
}
}
/* Update ABR state for all soft partitions */
mdclrerror(&xep);
/*
* call metaflushsetnames to reset local cache for master and
* node information.
*/
/* release signals back to what they were on entry */
mdclrerror(&xep);
/*
* If no error and stale_set is set, then set ep back
* to ep from snarf_set call and return -3. If another error
* occurred and rval is not 0, then that error would have
* caused the node to be withdrawn from the set and would
* have set ep to that error information.
*/
return (-3);
}
return (rval);
}
/*
* Entry point to withdraw a node from MultiNode diskset.
*
* Validate host in diskset.
* - Should be joined into diskset.
* in the local mddb since no node or drive can be added to the MNset
* unless all drives and nodes are available. Reconfig steps will
* resync all ALIVE nodes in case of panic in critical areas.
*
* Lock down the set.
* Verify that drives exist in configuration.
* Verify host is a member of this diskset.
* Verify host is an owner of the diskset (host is joined to diskset).
* Only allow withdrawal of master node if master node is the only joined
* node in the diskset.
* Halt the diskset on this node.
* Reset Master on this node.
* Update node flags to show that this node has withdrawn.
* Unlock the set.
*
* Return values:
* 0 - Node successfully withdrew from set.
* -1 - Withdrawal attempted but failed
* - any failure from libmeta calls
* - node not in the member list
* -2 - Withdrawal not attempted since
* - this set had no drives in set
* - this node not joined to set
* - set is not a multinode set
*/
extern int
)
{
md_drive_desc *dd = 0;
/* Value returned to the caller: 0, -1 or -2 (assigned on the paths below) */
int rval = 0;
/* Nonzero once the set has been halted; tells the error path to re-join */
int set_halted = 0;
/* Nonzero once rpc.mdcommd has been suspended for the nodelist change */
int suspendall_flag = 0;
/* Nonzero once class 1 messages have been suspended across the diskset */
int suspend1_flag = 0;
int node_id_list[1];
/* Nonzero once the nodelist is being changed, so a reinit must be sent */
int send_reinit = 0;
return (-1);
}
/* Must be a multinode diskset; any other set type fails with -1 */
if (!MD_MNSET_DESC(sd)) {
return (-1);
}
/* Make sure we are blocking all signals */
mdclrerror(&xep);
/*
* Lock the set on current set members.
* For MN diskset lock_set and SUSPEND are used to protect against
* other meta* commands running on the other nodes.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
}
/*
* Lock out other meta* commands by suspending
* class 1 messages across the diskset.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
suspend1_flag = 1;
}
/* Get list of drives - needed in case of failure */
/* Error getting drives in list */
rval = -1;
goto out2;
}
/* no drives in list: withdrawal is not attempted (-2) */
rval = -2;
goto out2;
}
/*
* Verify that this host is a member (in the host list) of the set.
*/
while (nd) {
break;
}
}
/* nd is NULL when the walk above found no matching node */
if (!nd) {
rval = -1;
goto out2;
}
/*
* Call metaget_setownership that calls each node in diskset and
* marks in set descriptor if node is an owner of the set or not.
* metaget_setownership checks to see if a node is an owner by
* checking to see if that node's kernel has the mddb loaded.
* If a node had panic'd during a reconfig or another set
* operation, node records may not reflect the current state of
* the diskset, so calling metaget_setownership is the safest
* thing to do.
*/
rval = -1;
goto out2;
}
/*
* Verify that this node is joined
* to diskset (i.e. is an owner of the diskset).
* A node that is not joined is not withdrawn (-2).
*/
rval = -2;
goto out2;
}
/*
* For a MN diskset, only withdraw master if it is
* the only joined node.
*/
while (nd) {
/* Skip my node since checking for other owners */
continue;
}
/* If another owner node is found, error */
rval = -1;
goto out2;
}
}
}
/*
* Is current set STALE?
*/
(void) memset(&c, 0, sizeof (c));
c.c_id = 0;
rval = -1;
goto out;
}
/* Remember staleness so it can be restored if the set is re-joined */
if (c.c_flags & MDDB_C_STALE) {
stale_bool = TRUE;
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Start by suspending rpc.mdcommd (which drains it of all messages),
* then change the nodelist followed by a reinit and resume.
*/
while (nd) {
continue;
}
rval = -1;
goto out;
}
suspendall_flag = 1;
}
/*
* Withdraw the set - halt set.
* This will fail if any I/O is occurring to any metadevice which
* includes a resync to a mirror metadevice.
*/
set_halted = 1;
/* Was set actually halted? */
set_halted = 0;
}
rval = -1;
goto out;
}
/* Change to nodelist so need to send reinit to rpc.mdcommd */
send_reinit = 1;
/* Reset master on withdrawn node */
MD_MN_INVALID_NID, ep)) {
rval = -1;
goto out;
}
/* Mark my node as withdrawn and send to other nodes */
while (nd) {
continue;
}
rval = -1;
goto out;
}
}
/*
* If withdrawn node is a mirror owner, reset mirror owner
* to NULL. If an error occurs, print a warning and continue.
* Don't fail metaset because of mirror owner reset problem since
* next node to grab mirror will resolve this issue.
* Before next node grabs mirrors, metaset will show the withdrawn
* node as owner which is why an attempt to reset the mirror owner
* is made.
*/
while (nd) {
continue;
}
"Unable to reset mirror owner on node %s"),
nd->nd_nodename);
mdclrerror(&xep);
}
}
/*
* Common exit for paths taken after the set lock was attempted.
* When rval is -1 the steps below try to undo a partial withdrawal;
* errors from the undo steps are cleared (xep) so that they do not
* replace the original failure.
*/
out:
if (rval == -1) {
/* Rejoin node - Mark node as joined and send to other nodes */
while (nd) {
continue;
}
mdclrerror(&xep);
}
}
/* Set master on withdrawn node */
mdclrerror(&xep);
}
/* Join set if halt_set had succeeded */
if (set_halted) {
/*
* Causes mddbs to be loaded into the kernel.
* Set the force flag so that replica locations can be
* loaded into the kernel even if a mediator node was
* unavailable. This allows a node to join an MO
* diskset when there are sufficient replicas available,
* but a mediator node is unavailable.
*/
mdclrerror(&xep);
}
/* If set previously stale - make it so at re-join */
mdclrerror(&xep);
mdclrerror(&xep);
}
}
}
/*
* Notify rpc.mdcommd on all nodes of a nodelist change.
* Send reinit command to mdcommd which forces it to get
* fresh set description.
*/
if (send_reinit) {
/* Send reinit */
while (nd) {
continue;
}
/* Class is ignored for REINIT */
/*
* We are here because we failed to reinit
* rpc.mdcommd. However we potentially have
* an error from the previous call.
* If the previous call did fail, we
* capture that error and generate a perror
* with the string, "Unable to reinit...".
* Setting rval to -1 ensures that in the
* next iteration of the loop, ep is not
* clobbered.
*/
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
"Unable to reinit rpc.mdcommd."));
}
}
}
/*
* Exit for paths that need neither the undo steps nor the reinit
* above; only the resume, unlock and cache flush below are run.
*/
out2:
/*
* Unlock diskset by resuming messages across the diskset.
* Just resume all classes so that resume is the same whether
* just one class was locked or all classes were locked.
*/
if ((suspend1_flag) || (suspendall_flag)) {
while (nd) {
continue;
}
/*
* We are here because we failed to resume
* rpc.mdcommd. However we potentially have
* an error from the previous call.
* If the previous call did fail, we capture
* that error and generate a perror with
* the string, "Unable to resume...".
* Setting rval to -1 ensures that in the
* next iteration of the loop, ep is not
* clobbered.
*/
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
"Unable to resume rpc.mdcommd."));
}
}
}
/*
* Unlock set. This flushes the caches on the servers.
*/
while (nd) {
continue;
}
/* Keep the first error: a later unlock failure is cleared instead */
if (rval == 0)
else
mdclrerror(&xep);
rval = -1;
}
}
/*
* call metaflushsetnames to reset local cache for master and
* node information.
*/
/* release signals back to what they were on entry */
mdclrerror(&xep);
return (rval);
}
/*
* Update nodelist with cluster member information.
* A node not in the member list will be marked
* as not ALIVE and not OWN.
* A node in the member list will be marked ALIVE, but
* the OWN bit will not be changed.
*
* If mynode isn't in the membership list, fail causing
* another reconfig cycle to be started since a non-member
* node shouldn't be taking part in the reconfig cycle.
*
* Return values:
* 0 - No problem.
* 1 - Any failure including RPC failure to my node.
*/
int
)
{
/* 0 until a failure is seen; set to 1 on the failure paths below */
int rval = 0;
/*
* Walk through nodelist, checking to see if each
* node is in the member list.
* If node is not a member, reset ALIVE and OWN node flag.
* If node is a member, set ALIVE.
* If mynode's OWN flag gets reset, then halt the diskset on this node.
*/
while (nd) {
/* Inner walk over the member list looking for this node */
while (nl2) {
/* If node is in member list, set ALIVE */
break;
} else {
}
/* node is not in member list, mark !ALIVE and !OWN */
/* If node is mynode, then halt set if needed */
/*
* This shouldn't happen, but just
* in case... Any node not in the
* membership list should be dead and
* not running reconfig step1.
*/
mdclrerror(&xep);
}
}
/*
* Return failure since this node
* (mynode) is not in the membership
* list, but process the rest of the
* nodelist first so that rpc.metad
* can be updated with the latest
* membership information.
*/
rval = 1;
}
}
}
}
/* Send this information to rpc.metad */
/* Return failure if can't send node flags to rpc.metad */
if (rval == 0) {
rval = 1;
}
}
return (rval);
}
/*
* Choose master determines the master for a diskset.
* Each node determines the master on its own and
* adds this information to its local rpc.metad nodelist
* and also sends it to the kernel.
*
* Nodelist in set descriptor (sd) is sorted in
* monotonically increasing sequence of nodeid.
*
* Return values:
* 0 - No problem.
* 205 - There was an RPC problem to another node.
* -1 - There was an error. This could be an RPC error to my node.
* This is a catastrophic failure causing node to panic.
*/
int
)
{
/*
* Overview (the step comments below give the details):
* - A node that is joined keeps the current master when that master
* is still valid, ALIVE and joined. Otherwise it looks for an
* owner node, and then for a non-owner node, whose own node
* record is marked OK.
* - A node that is not joined first looks for an owner node that
* has a valid master, and then falls back to the same owner and
* non-owner search.
* - When no candidate is found, control goes to delete_set and all
* information about the set is removed from this node.
*/
int is_owner;
/* Flag, not a nodeid: 1 when all nodes are ALIVE and mynode is lowest */
int lowest_alive_nodeid = 0;
int rval = 0;
/*
* Is current node joined to diskset?
* Don't trust flags, really check to see if mddb is snarfed.
*/
/*
* If a node is joined to the diskset, this node checks
* to see if the current master of the diskset is valid and
* is still in the membership list (ALIVE) and is
* still joined (OWN). Need to verify if master is
* really joined - don't trust the flags. (Can trust
* ALIVE since set during earlier part of reconfig cycle.)
* If the current master is valid, still in the membership
* list and joined, then master is not changed on this node.
* Just return.
*
* Verify that nodeid is valid before accessing masternode.
*/
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
sd->sd_mn_master_nodeid)) {
return (205);
} else {
/* Any other failure */
return (-1);
}
} else {
TEXT_DOMAIN, "Set %s previous "
"master chosen %s (%d): %s"),
start_time));
/* Previous master is ok - done */
return (0);
}
}
}
/*
* If current master is no longer in the membership list or
* is no longer joined, then this node uses the following
* algorithm:
* - node calls RPC routine clnt_ownset to get latest
* information on which nodes are owners of diskset.
* clnt_ownset checks on each node to see if its kernel
* has that diskset snarfed.
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
/* Any other failure */
return (-1);
}
}
/*
* Set owner flag for each node based on whether
* that node really has a diskset mddb snarfed in
* or not.
*/
else
}
/*
* - node walks through nodelist looking for nodes that are
* owners of the diskset that are in the membership list.
* - for each owner, node calls RPC routine clnt_getset to
* see if that node has its node record set to OK.
* - If so, master is chosen to be this owner node.
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* Don't consider a node that isn't an owner */
continue;
}
/* Does node have its own node record set to OK? */
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
/* Any other failure */
return (-1);
}
}
while (nr) {
/* Found a master */
(md_set_record *)mnsr);
goto found_master;
}
}
}
}
/*
* - If no owner node has its own node record on its own node
* set to OK, then this node checks all of the non-owner
* nodes that are in the membership list.
* - for each non-owner, node calls RPC routine clnt_getset to
* see if that node has its node record set to OK.
* - If set doesn't exist, don't choose node for master.
* - If so, master is chosen to be this non-owner node.
*
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* Only checking non-owner nodes this time around */
continue;
}
/* Does node have its own node record set to OK? */
/*
* If set doesn't exist on non-owner node,
* don't consider this node for master.
*/
continue;
} else if ((mdanyrpcerror(ep)) &&
/* RPC failure to another node */
return (205);
} else {
/* Any other failure */
return (-1);
}
}
while (nr) {
/* Found a master */
(md_set_record *)mnsr);
goto found_master;
}
}
}
}
/*
* - If no node can be found that has its own node record on
* its node to be set to OK, then all alive nodes
* were in the process of being added to or deleted
* from set. Each alive node will remove all
* information pertaining to this set from its node.
*
* If all nodes in set are ALIVE, then call sdssc end routines
* since set was truly being initially created or destroyed.
*/
goto delete_set;
} else {
/*
* If node is not joined to diskset, then this
* node uses the following algorithm:
* - If unjoined node doesn't have a node record for itself,
* just delete the diskset since diskset was in the
* process of being created.
* - node needs to find master of diskset before
* reconfig cycle, if a master existed.
* - node calls RPC routine clnt_ownset to get latest
* information on which nodes are owners of diskset.
* clnt_ownset checks on each node to see if its
* kernel has that diskset snarfed.
*/
/*
* Is my node in the set description?
* If not, delete the set from this node.
* sr2setdesc sets sd_mn_mynode pointer to the node
* descriptor for this node if there was a node
* record for this node.
*
*/
goto delete_set;
}
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
/* Any other failure */
return (-1);
}
}
/*
* Set owner flag for each node based on whether
* that node really has a diskset mddb snarfed in
* or not.
*/
else
}
/*
* - node walks through nodelist looking for nodes that
* are owners of the diskset that are in
* the membership list.
* - for each owner, node calls RPC routine clnt_getset to
* see if that node has a master set and to get the
* diskset description.
* - If the owner node has a set description that doesn't
* include the non-joined node in the nodelist, this node
* removes its set description of that diskset
* (i.e. removes the set from its local mddbs). This is
* handling the case of when a node was removed from a
* diskset while it was not in the cluster membership
* list.
* - If that node has a master set and the master is in the
* membership list and is an owner, then either this was
* the master from before the reconfig cycle or this
* node has already chosen a new master - either way,
* the master value is valid as long as it is in the
* membership list and is an owner
* - master is chosen to be owner node's master
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* Don't consider a node that isn't an owner */
continue;
}
/* Get owner node's set record */
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
/* Any other failure */
return (-1);
}
}
/* Is this node in the owner node's set record */
while (nr) {
break;
}
}
/* my node not found - delete set */
goto delete_set;
}
/* Is owner's node's master valid? */
if (master_nodeid == MD_MN_INVALID_NID) {
continue;
}
while (nd2) {
goto found_master;
}
}
}
/*
* - If no owner node has a valid master, then follow
* algorithm of when a node is joined to the diskset.
* - node walks through nodelist looking for nodes that are
* owners of the diskset that are in the membership list.
* - for each owner, node calls RPC routine clnt_getset to
* see if that node has its node record set to OK.
* - If so, master is chosen to be this owner node.
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* Don't consider a node that isn't an owner */
continue;
}
/* Does node have its own node record set to OK? */
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
/* Any other failure */
return (-1);
}
}
while (nr) {
/* Found a master */
(md_set_record *)mnsr);
goto found_master;
}
}
}
}
/*
* - If no owner node has its own node record on its own node
* set to OK, then this node checks all of the non-owner
* nodes that are in the membership list.
* - for each non-owner, node calls RPC routine clnt_getset to
* see if that node has its node record set to OK.
* - If set doesn't exist, don't choose node for master.
* - If this node doesn't exist in the nodelist on any of the
* non-owner nodes, this node removes its set description
* of that diskset (i.e. removes the set from its local
* mddbs). This is handling the case of when a node was
* removed from a diskset while it was not in the
* cluster membership list.
* - If non-owner node has its node record set to OK and if
* this node hasn't removed this diskset (step directly
* before this one), then the master is chosen to be this
* non-owner node.
*/
while (nd) {
/* Don't consider node that isn't in member list */
continue;
}
/* Don't consider owner nodes since none are OK */
continue;
}
/*
* Don't need to get nodelist from my node since
* this is where sd_nodelist was obtained.
*/
continue;
}
/*
* If node has already been decided against for
* master, then skip it.
*/
continue;
}
/*
* Does node in my nodelist have its own node
* record marked OK on its node? And does node
* in my nodelist exist on all other nodes?
* Don't want to choose a node for master unless
* that node is marked OK on its own node and that
* node exists on all other alive nodes.
*
* This is guarding against the case when several
* nodes are down and one of the downed nodes is
* deleted from the diskset. When the down nodes
* are rebooted into the cluster, you don't want
* any node to pick the deleted node as the master.
*/
/*
* If set doesn't exist on non-owner node,
* don't consider this node for master.
*/
continue;
} else if (mdanyrpcerror(ep)) {
/* RPC failure to another node */
return (205);
} else {
/* Any other failure */
return (-1);
}
}
/*
* Is my node in the nodelist gotten from the other
* node? If not, then remove the set from my node
* since set was deleted from my node while my node
* was out of the cluster.
*/
while (nr) {
break;
}
}
/* my node not found - delete set */
goto delete_set;
}
/* Is node being checked marked OK on its own node? */
while (nr) {
}
break;
}
}
/*
* If node being checked doesn't exist on its
* own node - don't choose it as master.
*/
}
/*
* Check every node in my node's nodelist against
* the nodelist gotten from the other node.
* If a node in my node's nodelist is not found in the
* other node's nodelist, then set the DEL flag.
*/
while (nd2) {
while (nr) {
break;
}
}
/* nd2 not found in other node's nodelist */
}
}
}
/*
* Rescan list look for node that has not been marked DEL.
* First node found is the master.
*/
while (nd) {
break;
}
continue;
}
/* nd is non-NULL only when the rescan stopped on a candidate */
if (nd) {
/* Found a master */
goto found_master;
}
/*
* - If no node can be found that has its own node record on
* its node to be set to OK, then all alive nodes
* were in the process of being added to or deleted
* from set. Each alive node will remove all
* information pertaining to this set from its node.
*
* If all nodes in set are ALIVE, then call sdssc end routines
* since set was truly being initially created or destroyed.
*/
goto delete_set;
}
"Set %s master chosen %s (%d): %s"),
return (-1);
}
rval = -1;
/* If this node is new master, set flag in this node's kernel */
/* Use magic to help protect ioctl against attack. */
"Setting new master flag for set %s: %s"),
/*
* Fail reconfig cycle if ioctl fails since it is critical
* to set new master flag.
*/
rval = -1;
}
}
if (rval == 0) {
rval = -1;
}
}
return (rval);
"Master not chosen, deleting set %s: %s"),
/*
* Remove all set information from this node:
* - node records for this set
* - drive records for this set
* - set record for this set
* (Only do this on this node since each node
* will do it for its own local mddb.)
*
* If all nodes in set are ALIVE, then
* the lowest numbered ALIVE nodeid in set
* (regardless of whether an owner node or not) will
* sdssc_create_end(cleanup) if set was being created or
* sdssc_delete_end(cleanup) if set was being deleted.
* A node record with flag ADD denotes a set being
* created. A node record with flag DEL denotes a
* set being deleted.
*/
while (nd) {
/* Found a node that isn't alive */
break;
/* Is my node the lowest numbered ALIVE node? */
break;
}
}
/* All nodes ALIVE and this is the lowest nodeid */
lowest_alive_nodeid = 1;
}
return (-1);
}
/*
* If this node had been joined, withdraw and reset master.
*
* This could happen if a node was being added to or removed
* from a diskset and all other nodes in the diskset have left
* the cluster.
*/
if (sd->sd_mn_mynode) {
rval = -1;
goto out;
}
MD_MN_INVALID_NID, ep)) {
rval = -1;
goto out;
}
}
}
/*
* Remove side records for this node (side) from local mddb
* (clnt_deldrvs does this) if there are drives in the set.
*
* Don't need to mark this node as DEL since already marked as
* ADD or DEL (or this node would have been chosen as master).
* Don't need to mark other node records, drive records or
* set records as DEL. If a panic occurs during clnt_delset,
* these records will be deleted the next time this node
* becomes a member and goes through the reconfig cycle.
*/
/* Get the drive descriptors for this set */
/*
* Ignore and clear out any failures from
* metaget_drivedesc since a panic could have
* occurred when a node was partially added to a set.
*/
mdclrerror(ep);
}
} else {
rval = -1;
goto out;
}
}
/*
* Now, delete the set - this removes the node, drive
* and set records from the local mddb.
*/
rval = -1;
goto out;
}
/* Common exit for the delete-set path; rval is -1 on any failure above */
out:
/*
* Ignore errors from unlock of set since set is no longer
* known (if clnt_delset worked).
*/
mdclrerror(&xep);
}
/*
* If this node is the lowest numbered nodeid then
* call sdssc_create/delete_end depending on whether
* this node is marked as ADD or DEL in the node record.
*/
if (lowest_alive_nodeid) {
}
/* Finished with this set -- return */
return (rval);
}
/*
* Reconfig step to choose a new master for all MN disksets.
* Return values:
* 0 - Everything is great.
* 1 - This node failed to reconfig.
* 205 - Cause another reconfig due to a nodelist problem
* or RPC failure to another node
*/
int
)
{
int nodecnt;
/* Result of choosing a master for one set: -1 and 205 end the function */
int rval = 0;
/* Set to 1 after the one-time startup delay so later sets do not delay */
int start_node_delayed = 0;
"Unable to get number of sets"));
return (1);
}
/*
* Get membershiplist from API routine. If there's
* an error, return a 205 to cause another reconfig.
*/
return (205);
}
/* No set for this setno - continue */
mdclrerror(ep);
continue;
} else {
/*
* If encountered an RPC error from my node,
* then immediately fail.
*/
if (mdanyrpcerror(ep)) {
return (1);
}
/* Can't get set information */
"Unable to get information for "
"set number %d"), setno);
mdclrerror(ep);
continue;
}
}
/* If setname is there, set desc should exist. */
/*
* If encountered an RPC error from my node,
* then immediately fail.
*/
if (mdanyrpcerror(ep)) {
return (1);
}
"Unable to get set %s desc information"),
mdclrerror(ep);
continue;
}
/* Only reconfig MN disksets */
if (!MD_MNSET_DESC(sd)) {
continue;
}
"Begin choose master for set %s: %s"),
/* Update nodelist with member information. */
/*
* If encountered an RPC error from my node,
* then immediately fail.
*/
if (mdanyrpcerror(ep)) {
return (1);
}
mdclrerror(ep);
continue;
}
/*
* If all nodes in a cluster are starting, then
* all nodes will attempt to contact all other nodes
* to determine a master node. This can lead to a
* problem where node 1 is trying to contact the rpc.metad
* on node 2 and node 2 is trying to contact the rpc.metad
* on node 1 -- and this causes the rpc call to fail
* on both nodes and causes a new reconfig cycle.
*
* In order to break this problem, a newly starting node
* will delay a small amount of time (nodeid mod 4 seconds)
* and will then run the code to choose a master for the
* first set. Delay will only be done once regardless of the
* number of sets.
*/
if (start_node_delayed == 0) {
/* Use magic to help protect ioctl against attack. */
}
start_node_delayed = 1;
}
/* Choose master for this set */
/* Map the per-set result onto this function's 1 / 205 return codes */
if (rval == -1) {
return (1);
} else if (rval == 205) {
return (205);
}
/* Send new nodelist to rpc.mdcommd */
"Choose master for set %s completed: %s"),
}
/*
* This is to recover from the situation where the master died
* for a MN diskset.
* If a failure occurs return a 1 which will force this node to
* not resumed.
*/
setno = 0; /* 0 means all MN sets */
return (1);
}
/*
* Free the nodelist.
* NOTE(review): the early return paths above leave the function
* without reaching this free - confirm whether the nodelist is
* leaked on those paths.
*/
if (nodecnt)
return (0);
}
/*
* meta_mnsync_user_records will synchronize the diskset user records across
* all nodes in the diskset. The diskset user records are stored in
* each node's local set mddb.
*
* This needs to be done even if there is no master change during the
* reconfig cycle since this routine should clean up any mess left by
* the untimely termination of a metaset or metadb command (due to a
* node panic or to user intervention).
*
* Caller is the Master node.
*
* Returns 0 - Success
* 205 - Failure during RPC to another node
* -1 - Any other failure and ep is filled in.
*/
int
)
{
int found_my_nr;
int all_drives_ok;
int rval = 0;
int max_genid = 0;
int num_alive_nodes, num_alive_nodes_del = 0;
int set_locked = 0;
char *anode[1];
/*
* Sync up node records first.
* Construct a master nodelist using the nodelist from this
* node's rpc.metad node records and then setting the state of each
* node following these rules:
* - If a node record is marked OK on its node, mark it OK
* in the master nodelist (and later OK on all nodes)
* If a node record is also marked OWN on its node,
* mark it OWN in the master nodelist.
* - If a node record is not marked OK on its node, then mark
* it as DEL in the master list (later deleting it)
* - If node record doesn't exist on that node, then mark it DEL
* (later deleting it)
* - If set record doesn't exist on that node, mark node as DEL
* - If a node record doesn't exist on all nodes, then mark it DEL
* - If a node is not ALIVE, then
* - If that node marked DEL on any node - mark it DEL
* in master list but leave in nodelist
* - If that node is marked as ADD on any node, mark it
* ADD in the master list but leave in nodelist
* - When that node returns to the living, the DEL
* node record will be removed and the ADD node
* record may be removed if marked ADD on that
* node.
* The key rule is to not remove a node from the nodelist until
* that node record is removed from its own node. Do not want to
* remove a node's record from all other nodes and then have
* that node have its own record marked OK so that a node will pick
* a different master than the other nodes.
*
* Next,
* If node is ALIVE and node record is marked DEL in master nodelist,
* remove node from set.
* If node is ALIVE and node record is marked OK in master nodelist,
* mark it OK on all other nodes.
* If node is not ALIVE and node record is marked DEL in master
* nodelist, mark it DEL on all other nodes.
* If node is not ALIVE and node record is marked ADD in master,
* nodelist, mark it ADD on all other nodes.
*/
return (-1);
}
/*
* Walk through nodelist creating a master nodelist.
*/
num_alive_nodes = 0;
while (nd) {
continue;
}
/* set doesn't exist, mark node as DEL */
continue;
} else {
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
}
/* Find biggest genid in records for this diskset */
while (dr) {
/* Find biggest genid in records for this diskset */
}
}
found_my_nr = 0;
/* nr is the list of node recs from nd_nodename node */
while (nr) {
/* Find biggest genid in records for this diskset */
/* For each node record, is it in master list? */
while (nd2) {
break;
}
/*
* Found node record not in master list -- add it
* to list marking it as DEL since node record
* should exist on all nodes unless a panic occurred
* during addition or deletion of host to diskset.
*/
nr->nr_nodename);
continue;
}
/*
* Is this the node record for the node that
* we requested the set desc from?
* If so, check if node has its own node record
* marked OK. If marked OK, check for the OWN bit.
*/
found_my_nr = 1;
/*
* If node record is marked OK
* on its own node, then mark it OK
* in the master list. Node record
* would have to exist on all nodes
* in the ADD state before it could
* be put into the OK state.
*/
~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
/*
* Mark own in master list as marked
* on own node.
*/
else
} else {
/* Otherwise, mark node as DEL */
}
}
/*
* If node is not ALIVE and marked DEL
* on any node, make it DEL in master list.
* If node is not ALIVE and marked ADD
* on any node, make it ADD in master list
* unless node record has already been marked DEL.
*/
/* If not DEL - mark it ADD */
}
}
/* Could already be ADD - make it DEL */
}
}
}
/*
* If a node record doesn't exist on its own node,
* then mark node as DEL.
*/
if (found_my_nr == 0) {
}
/*
* If node is OK - put mnsr onto master_mnsr_node list for
* later use when syncing up the drive records in the set.
*/
} else {
}
}
"Master nodelist created for set %s: %s"),
/*
* Send master nodelist to the rpc.metad on all nodes (including
* myself) and each node will update itself. This will set the
* ADD and DEL flags on each node as setup in the master nodelist.
* Don't send nodelist to node where set doesn't exist.
*/
while (nd) {
continue;
}
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
}
/*
* Now, delete nodes that need to be deleted.
*/
rval = -1;
goto out;
}
}
/*
* May be doing lots of RPC commands to the nodes, so lock the
* ALIVE members of the set since most of the rpc.metad routines
* require this for security reasons.
*/
while (nd) {
/* Skip non-alive nodes and node without set */
continue;
}
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
set_locked = 1;
}
while (nd) {
/* Skip non-alive nodes */
continue;
}
/*
* Delete this node rec from all ALIVE nodes in diskset.
*/
while (nd2) {
/* Skip non-alive nodes and node without set */
continue;
}
/* This is a node being deleted from set */
/* Mark set record as DEL */
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
(sd->sd_mn_mynode->
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
(sd->sd_mn_mynode->
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
ep) == -1) {
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
(sd->sd_mn_mynode->
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
} else {
/*
* Delete host from sets on hosts
* not being deleted.
*/
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
(sd->sd_mn_mynode->
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
"Deleted node %s (%d) on node %s "
"from set %s: %s"),
gethrtime() - start_time));
}
}
}
}
while (nd) {
/* Skip non-alive nodes and node without set */
continue;
}
/* If RPC failure to another node return 205 */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
}
set_locked = 0;
"Nodelist syncronization complete for set %s: %s"),
/*
* If all alive nodes have been deleted from set, just
* return since nothing else can be done until non-alive
* nodes (if there are any) rejoin the cluster.
*/
if (num_alive_nodes == num_alive_nodes_del) {
rval = 0;
goto out;
}
/*
* Sync up drive records.
*
* If a node panic'd (or metaset command was killed) during the
* addition or deletion of a drive to the diskset, the nodes
* may have a different view of the drive list. During cleanup
* of the drive list during reconfig, a drive will be deleted
* from the list if the master node sees that the drive has been
* marked in the ADD state on any node or is marked in the DEL state
* on all nodes.
* This cleanup must occur even if all nodes in the cluster are
* not part of the cluster so that all nodes have the same view
* of the drivelist.
* Then if the entire cluster goes down and comes back up, the
* new master node could be a node that wasn't in the cluster when
* the node was deleted. This could lead to a situation where the
* master node thinks that a drive is OK, but this drive isn't
* known to the other nodes.
* This situation can also occur during the addition of a drive
* where a node has the drive marked OK, but the node executing the
* metaset command encountered a failure before marking that drive OK
* on the rest of the nodes. If the node with the OK drive then
* panics, then rest of the nodes will remove that drive marked ADD
* and when the node with the OK drive rejoins the cluster, it will
* have a drive marked OK that is unknown by the other nodes.
*
* There are 2 situations to consider:
* A) Master knows about a drive that other nodes don't know about.
* B) At least one slave node knows about a drive that the master
* node doesn't know about.
*
* To handle these situations the following steps are followed:
* 1) Count number of drives known by this master node and the
* other slave nodes.
* If all nodes have the same number of drives and the master has
* all drives marked OK, then skip to step4.
*
* 2) If a node has less drives listed than the master, the master
* must get the drive descriptor list from that node so that
* master can determine which drive it needs to delete from that
* node. Master must get the drive descriptor list since the
* drive record list does not contain the name of the drive, but
* only a key and the key can only be interpreted on that other
* node.
*
* 3) The master will then create the master drive list by doing:
* - Master starts with drive list known by master.
* - Any drive marked ADD will be removed from the list.
* - Any drive not known by another node (from step2) will be
* removed from the drive list.
* - If a drive is marked DEL on the master, the master must
* verify that the drive record is marked DEL on all nodes.
* If any node has the drive record marked OK, mark it OK
* on the master. (The reason why is described below).
*
* 4) The master sends out the master drive list and the slave
* nodes will force their drive lists to match the master
* drive list by deleting drives, if necessary and by changing
* the drive record states from ADD->OK if master has drive
* marked OK and slave has drive marked ADD.
*
* Interesting scenarios:
*
* 1) System has 4 nodes with node 1 as the master. Node 3 starts
* to delete a drive record (drive record on node 1 is marked DEL),
* but is stopped when node 3 panics. Node 1 also panics.
* During reconfig cycle, node 2 is picked as master and the drive
* record is left alone since all nodes in the cluster have it
* marked OK. User now sees drive as part of diskset.
* Now, entire cluster is rebooted and node 1 rejoins the cluster.
* Node 1 is picked as the master and node 1 has drive record
* marked DEL. Node 1 contacts all other nodes in the cluster
* and since at least one node has the drive record marked OK,
* the master marks the drive record OK.
* User continues to see the drive as part of the diskset.
*/
/* Reget set descriptor since flushed above */
rval = -1;
goto out;
}
/* Has side effect of setting sd->sd_drvs to same as master_dd */
/* No drives in list */
/*
* Can't get drive list for this node, so
* return -1 causing this node to be removed
* cluster config and fixed.
*/
rval = -1;
goto out;
}
}
/* Count the number of drives for all nodes */
while (mnsr_node) {
dr_cnt = 0;
while (dr) {
dr_cnt++;
}
}
/* Count the number of drives for the master; also check flags */
all_drives_ok = 1;
dd_cnt = 0;
while (dd) {
dd_cnt++;
all_drives_ok = 0;
}
/* If all drives are ok, do quick check against number of drives */
if (all_drives_ok) {
/* If all nodes have same number of drives, almost done */
while (mnsr_node) {
break;
}
/* All nodes have same number of drives, just send flags */
goto send_drive_list;
}
}
"Begin detailed drive synchronization for set %s: %s"),
/* Detailed check required */
while (mnsr_node) {
/* Does slave node have less drives than master? */
/* Yes - must determine which drive is missing */
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
!= 0)) {
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
"Master node %s unable to "
"retrieve drive list from node %s"),
goto out;
}
while (dd) {
continue;
}
while (other_dd) {
/* Convert to devids, when available */
break;
}
}
/*
* dd not found on slave so mark it
* ADD for later deletion (drives in ADD
* state are deleted later in this routine).
*/
}
}
}
}
"Drive check completed for set %s: %s"),
dd_prev = 0;
while (dd) {
/* Remove any ADD drives from list */
if (dd_prev) {
} else {
/*
* If removing drive descriptor from head
* of linked list, also change sd->sd_drvs.
*/
}
continue;
}
/*
* If drive is marked DEL, check all other nodes.
* If drive on another node is marked OK, mark drive OK
* in master list. If drive is marked DEL or doesn't exist
* on all nodes, remove drive from list.
*/
while (mnsr_node) {
if (clnt_getdrivedesc(
/* RPC failure to !my node */
if ((mdanyrpcerror(ep)) &&
!= 0)) {
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
"Master node %s unable "
"to retrieve drive list from "
"node %s"), mynode(),
goto out;
}
}
while (other_dd) {
/* Found drive (OK) from other node */
== 0) {
/* Drive marked OK */
MD_DR_OK) {
}
break;
}
}
break;
}
/*
* If no node had this drive marked OK, delete it.
*/
if (dd_prev) {
} else {
/*
* If removing drive descriptor from
* head of linked list, also change
* sd->sd_drvs.
*/
}
continue;
}
}
}
"Setting drive states completed for set %s: %s"),
/*
* Set genid on all drives to be the highest value seen.
*/
while (dd) {
}
/*
* Send updated drive list to all alive nodes.
* Will also set genid on set and node records to have same
* as the drive records.
*/
while (nd) {
/* Skip non-alive nodes */
continue;
}
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
}
"Sent drive list to all nodes for set %s: %s"),
/*
* If no drive records left in set and nodes had been joined,
* withdraw the nodes. Always reset the master and mark
* all nodes as withdrawn on all nodes.
*/
/* Reset new master flag since no longer master */
/* Use magic to help protect ioctl against attack. */
/* Ignore failure, failure to reset flag isn't catastrophic */
"Reset new master flag for " "set %s: %s"),
while (nd) {
/* Skip non-alive nodes */
continue;
}
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
set_locked = 1;
/* Withdraw node from set if owner */
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
/* Mark all nodes as withdrawn on this node */
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
/* Resets master to no-master on this node */
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
/* RPC failure to another node */
if ((mdanyrpcerror(ep)) &&
rval = 205;
} else {
/* Any other failure */
rval = -1;
}
goto out;
}
set_locked = 0;
}
}
out:
/*
* If got here and set is still locked, then an error has
* occurred and master_nodelist is still valid.
* If error is not an RPC error, then unlock.
* If error is an RPC error, skip unlocks since this could cause
* yet another RPC timeout if a node has failed.
* Ignore failures in unlock since unlock is just trying to
* clean things up.
*/
while (nd) {
/* Skip non-alive nodes */
continue;
}
/*
* If clnt_unlock fails, just break out since next
* reconfig cycle will reset the locks anyway.
*/
break;
}
}
}
/* Free master_mnsr and drive descs */
while (mnsr_node) {
}
/* Frees sd->sd_drvs (which is also master_dd) */
return (rval);
}
/*
* meta_mnsync_diskset_mddbs
* Calling node is guaranteed to be an owner node.
* Calling node is the master node.
*
* Master node verifies that ondisk mddb format matches its incore format.
* If no nodes are joined to set, remove the change log entries.
* If a node is joined to set, play the change log.
*
* Returns 0 - Success
* 1 - Master unable to join to set.
* 205 - Failure during RPC to another node
* -1 - Any other failure and ep is filled in.
* -1 return will eventually cause node to panic
* in a SunCluster environment.
*/
int
)
{
/*
* Set to 1 when the join of the master reports a STALE set; the
* changelog reset further down is skipped in that case.
*/
int stale_set = 0;
/* If setname is there, set desc should exist. */
return (-1);
}
/* Are there drives in the set? */
return (-1);
}
/* No drives in set -- nothing to sync up */
return (0);
}
/*
* Is master node (which is this node) joined to set?
* If master node isn't joined (which means that no nodes
* are joined to diskset), remove the change log entries
* since no need to replay them - all nodes will have same
* view of mddbs since all nodes are reading in the mddbs
* from disk.
* There is also no need to sync up the master and ondisk mddbs
* since master has no incore knowledge.
* Need to join master to set in order to flush the change
* log entries. Don't need to block I/O during join of master
* to set since no other nodes are joined to set and so no I/O
* can be occurring.
*/
/* Join master to set */
MNSET_IN_RECONFIG, ep)) {
/*
* If STALE, print message and continue on.
* Don't do any writes or reads to mddbs
* so don't clear change log.
*/
"Join of master node to STALE set %s"),
stale_set = 1;
mdclrerror(ep);
/* ACCOK means mediator provided extra vote */
mdclrerror(ep);
} else {
/*
* If master is unable to join set, print an
* error message. Don't return failure or node
* will panic during cluster reconfig cycle.
* Also, withdraw node from set in order to
* cleanup from failed join attempt.
*/
"Join of master node in set %s failed"),
mdclrerror(&xep);
/* 1 (not -1) reports "master unable to join" without a panic */
return (1);
}
}
/*
* Master node successfully joined.
* Set local copy of flags to OWN and
* send owner flag to rpc.metad. If not stale,
* flush the change log.
*/
MNSET_IN_RECONFIG, ep)) {
"Flag update of master node join in set %s failed"),
return (-1);
}
/* A STALE set must not be written to, so only reset the log if not stale */
if (!stale_set) {
MDMN_CLF_RESETLOG) != 0) {
"Unable to reset changelog."));
return (-1);
}
"Removed changelog entries for set %s: %s"),
}
/* Reset new master flag before return */
/* Use magic to help protect ioctl against attack. */
/* Ignore failure, failure to reset flag isn't catastrophic */
"Reset new master flag for set %s: %s"),
return (0);
}
/*
* Is master already joined to STALE set (< 50% mddbs avail)?
* If so, can make no config changes to mddbs so don't check or play
* changelog and don't sync master node to ondisk mddbs.
* To get out of the stale state all nodes must be withdrawn
* from set. Then as nodes are re-joined, all nodes will
* have same view of mddbs since all nodes are reading the
* mddbs from disk.
*/
(void) memset(&c, 0, sizeof (c));
c.c_id = 0;
return (-1);
}
/* STALE set: nothing can be synced or replayed, so report success */
if (c.c_flags & MDDB_C_STALE) {
return (0);
}
/*
* If this node is NOT a newly chosen master, then there's
* nothing else to do since the change log should be empty and
* the ondisk and incore mddbs are already consistent.
*
* A newly chosen master is a node that was not the master
* at the beginning of the reconfig cycle. If a node is a new
* master, then the new master state is reset after the ondisk
* and incore mddbs are consistent and the change log has
* been replayed.
*/
/* Use magic to help protect ioctl against attack. */
return (0);
}
/*
* Now, sync up incore master view to ondisk mddbs.
* This is needed in the case where a master node
* had made a change to the mddb, but this change
* may not have been relayed to the slaves yet.
* So, the new master needs to verify that the ondisk
* mddbs match what the new master has incore -
* if different, new master rewrites all of the mddbs.
* Then the new master will replay the changelog and the
* new master will then execute what the old master had
* done.
*
* First suspend I/O (MN_SUSP_IO) on the alive owner nodes of
* the diskset. This will allow the rewriting of the mddbs
* (if needed), to proceed in a timely manner.
*
*/
while (nd) {
/* Skip non-alive and non-owner nodes */
continue;
}
MN_SUSP_IO, ep)) {
"Unable to suspend I/O on node %s in set %s"),
/*
* Resume all other nodes that had been suspended.
* (NOTE(review): presumably the reconfig return step also
* resumes I/O for all sets -- confirm.)
*/
while (nd2) {
/* Stop when reaching failed node */
break;
/* Skip non-alive and non-owner nodes */
continue;
}
}
/*
* If an RPC failure on another node, return a 205.
* Otherwise, exit with failure.
*/
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
return (-1);
}
}
}
(void) memset(&c, 0, sizeof (c));
c.c_id = 0;
/* Master can't sync up to ondisk mddbs? Kick it out of cluster */
return (-1);
/*
* Resume I/O on the alive owner nodes now that the sync attempt
* is over (failures are reported as "Unable to resume I/O").
*/
while (nd) {
/* Skip non-alive and non-owner nodes */
continue;
}
"Unable to resume I/O on node %s in set %s"),
/*
* If an RPC failure then don't do any
* more RPC calls, since one timeout is enough
* to endure. If RPC failure to another node, return
* 205. If RPC failure to my node, return -1.
* If not an RPC failure, continue resuming the
* rest of the nodes and then return -1.
*/
/*
* NOTE(review): the test that tells "my node" from "another
* node" is not visible between the two returns below; as
* shown, the branches do not match the comment above --
* confirm against the complete source.
*/
if (mdanyrpcerror(ep)) {
return (-1);
} else {
return (205);
}
}
/*
* If not an RPC error, continue resuming rest of
* nodes, ignoring any failures except for an
* RPC failure which constitutes an immediate exit.
* Start in middle of list with failing node.
*/
while (nd2) {
/* Skip non-alive and non-owner nodes */
continue;
}
if (mdanyrpcerror(&xep)) {
return (-1);
}
}
}
}
/*
* Send (aka replay) all messages we find in the changelog.
* Flag the messages with
* MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
* MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
*/
/* ret is the value printed by the "mdmn_send_message returned" message */
int ret;
/* no entry for this class */
continue;
}
"replaying message ID=(%d, 0x%llx-%d)\n"),
&resultp,
&xep);
"mdmn_send_message returned %d\n"), ret);
if (resultp)
}
"Playing changelog completed for set %s: %s"),
/*
* Now that new master has ondisk and incore mddbs in sync, reset
* this node's new master kernel flag (for this set). If this node
* re-enters another reconfig cycle before the completion of this
* reconfig cycle, this master node won't need to check if the ondisk
* and incore mddbs are in sync since this node won't be considered
* a new master (since this flag is being reset here in the middle of
* step2). This will save time during any subsequent reconfig
* cycles as long as this node continues to be master.
*/
/* Use magic to help protect ioctl against attack. */
/* Ignore failure, since failure to reset flag isn't catastrophic */
"Reset new master flag for set %s: %s"),
return (0);
}
/*
* meta_mnjoin_all will join all starting nodes in the diskset.
* A starting node is considered to be any node that is not
* an owner of the set but is a member of the cluster.
* Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
*
* Caller is the Master node.
*
* Returns 0 - Success
* 205 - Failure during RPC to another node
* -1 - Any other failure and ep is filled in.
*/
int
)
{
/*
* rval: value returned at the end; set to -1 when a node fails in
* the final update loop with something other than an RPC error.
* stale_flag: compared against MNSET_IS_STALE to decide whether I/O
* needs suspending (NOTE(review): the assignment is not visible
* here; presumably set in the MDDB_C_STALE check below -- confirm).
* susp_res_flag: set to 1 once a node that will be joined is found;
* gates both the suspend and the later resume of I/O.
*/
int rval = 0;
int stale_flag = 0;
int susp_res_flag = 0;
/* If setname is there, set desc should exist. */
return (-1);
}
/* Are there drives in the set? */
return (-1);
}
/* No drives in set -- nothing to join */
return (0);
}
/*
* Is set currently stale?
*/
(void) memset(&c, 0, sizeof (c));
c.c_id = 0;
/* Ignore failure since master node may not be joined yet */
if (c.c_flags & MDDB_C_STALE) {
}
/*
* If any nodes are going to be joined to diskset, then
* suspend I/O to all disks in diskset so that nodes can join
* (read in mddbs) in a reasonable amount of time even under
* high I/O load. Don't need to do this if set is STALE since
* no I/O can be occurring to a STALE set.
*/
if (stale_flag != MNSET_IS_STALE) {
while (nd) {
/* Found a node that will be joined to diskset */
/* Set flag that diskset should be suspended */
susp_res_flag = 1;
break;
}
}
}
if (susp_res_flag) {
/*
* Suspend I/O (MN_SUSP_IO) on the alive owner
* nodes in the diskset.
* If the suspend fails due to an RPC failure on another
* node, return 205; otherwise, return -1.
*/
while (nd) {
/* Skip non-alive and non-owner nodes */
continue;
}
MN_SUSP_IO, ep)) {
"Unable to suspend I/O on node %s"
/*
* Resume other nodes that had been suspended.
* (NOTE(review): presumably the reconfig return step
* also resumes I/O for all sets -- confirm.)
*/
while (nd2) {
/* Stop when reaching failed node */
break;
MD_MN_NODE_ALIVE)) ||
MD_MN_NODE_OWN))) {
continue;
}
(void) (clnt_mn_susp_res_io(
}
/*
* If the suspend failed due to an
* RPC failure on another node, return
* a 205.
* Otherwise, exit with failure.
* The return reconfig step will resume
* (NOTE(review): sentence is cut short here;
* presumably "I/O for all sets" -- confirm.)
*/
if ((mdanyrpcerror(ep)) &&
return (205);
} else {
return (-1);
}
}
}
}
while (nd) {
/*
* If a node is in the membership list but isn't joined
* to the set, try to join the node.
*/
/*
* If RPC failure to another node
* then exit without attempting anything else.
* (NOTE(review): presumably the reconfig return step
* also resumes I/O for all sets -- confirm.)
*/
if (mdanyrpcerror(ep)) {
return (205);
}
/*
* STALE and ACCOK failures aren't true
* failures. STALE means that <50% mddbs
* are available. ACCOK means that the
* mediator provided the extra vote.
* If a true failure, then print message
* and withdraw node from set in order to
* cleanup from failed join attempt.
*/
"WARNING: Unable to join node %s "
mdclrerror(ep);
mdclrerror(&xep);
continue;
}
}
/* Set owner flag even if STALE or ACCOK */
}
}
/*
* Resume I/O on the nodes that were suspended above; only needed
* when the suspend was actually done (susp_res_flag set).
*/
if (susp_res_flag) {
while (nd) {
/*
* Skip non-alive and non-owner nodes
* (this list doesn't include any of
* the nodes that were joined).
*/
continue;
}
"Unable to resume I/O on node %s"
/*
* If an RPC failure then don't do any
* more RPC calls, since one timeout is enough
* to endure. If RPC failure to another node,
* return 205. If RPC failure to my node,
* return -1.
* (NOTE(review): presumably the reconfig return step
* also resumes I/O for all sets -- confirm.)
* If not an RPC failure, continue resuming the
* rest of the nodes and then return -1.
*/
/*
* NOTE(review): the test that tells "my node" from
* "another node" is not visible between the two returns
* below; as shown, the branches do not match the
* comment above -- confirm against the complete source.
*/
if (mdanyrpcerror(ep)) {
return (-1);
} else {
return (205);
}
}
/*
* If not an RPC error, continue resuming rest
* of nodes, ignoring any failures except for
* an RPC failure which constitutes an
* immediate exit.
* Start in middle of list with failing node.
*/
while (nd2) {
/* Skip non-alive and non-owner nodes */
MD_MN_NODE_ALIVE)) ||
MD_MN_NODE_OWN))) {
continue;
}
(void) (clnt_mn_susp_res_io(
if (mdanyrpcerror(&xep)) {
return (-1);
}
}
}
}
}
while (nd) {
continue;
}
/*
* If 1 node fails - go ahead and update the rest except
* in the case of an RPC failure, fail immediately.
*/
/* RPC failure to another node */
if (mdanyrpcerror(ep)) {
return (205);
}
/* Remember the failure but keep updating the remaining nodes */
rval = -1;
continue;
}
}
"Join of all nodes completed for set %s: %s"),
return (rval);
}