meta_mn_handlers.c revision 2
2N/A * The contents of this file are subject to the terms of the 2N/A * Common Development and Distribution License (the "License"). 2N/A * You may not use this file except in compliance with the License. 2N/A * See the License for the specific language governing permissions 2N/A * and limitations under the License. 2N/A * When distributing Covered Code, include this CDDL HEADER in each 2N/A * If applicable, add the following below this CDDL HEADER, with the 2N/A * fields enclosed by brackets "[]" replaced with your own identifying 2N/A * information: Portions Copyright [yyyy] [name of copyright owner] 2N/A * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 2N/A * Use is subject to license terms. 2N/A/* we reserve 1024 bytes for stdout and the same for stderr */ 2N/A#
define JUNK 128 /* used to flush stdout and stderr */ 2N/A * We are given one string containing all the arguments 2N/A * For execvp() we have to regenerate the arguments again 2N/A int arg;
/* argument that is currently being built */ 2N/A int i;
/* helper for for loop */ 2N/A char *
cp;
/* runs through the given command line string */ 2N/A /* init the args array alloc the first one and null out the rest */ 2N/A "PANIC: too many arguments specified\n"));
2N/A "PANIC: argument too long\n"));
2N/A if ((*
cp !=
' ') && (*
cp !=
'\t')) {
2N/A * No space or tab: copy char into current 2N/A * argv and advance both pointers 2N/A cp++;
/* next char in command line */ 2N/A * space or tab: terminate current argv, 2N/A * advance arg, reset pointer into arg, 2N/A * advance pointer in command line 2N/A cp++;
/* next char in command line */ 2N/A /* terminate the last real argument */ 2N/A /* the last argument is a NULL pointer */ 2N/A "PANIC: pipe failed\n"));
2N/A "PANIC: pipe failed\n"));
2N/A "PANIC: fork failed\n"));
2N/A /* close the reading channels of pout and perr */ 2N/A /* redirect stdout */ 2N/A "PANIC: dup2 failed\n"));
2N/A /* redirect stderr */ 2N/A "PANIC: dup2 failed\n"));
2N/A /* parent process */ 2N/A char *
out, *
err;
/* for stdout and stderr of child */ 2N/A int i;
/* index into the aboves */ 2N/A /* close the writing channels of pout and perr */ 2N/A * Did the child produce some output to stdout? 2N/A * If so, read it until we either reach the end of the 2N/A * output or until we read MAX_OUT bytes. 2N/A * Whatever comes first. 2N/A * In case we already read MAX_OUT bytes we simply 2N/A * read away the output into a junk buffer. 2N/A * Just to make the child happy 2N/A /* buffer full, empty stdout */ 2N/A /* stdout is closed by child */ 2N/A /* buffer full, empty stderr */ 2N/A /* stderr is closed by child */ 2N/A * This is for checking if a metadevice is opened, and for 2N/A * locking in case it is not and for 2N/A * unlocking a locked device 2N/A * In case the ioctl succeeded, return the open state of 2N/A * the metadevice. Otherwise we return the error the ioctl 2N/A * produced. As this is not zero, no attempt is made to 2N/A * When doing a metaclear, one node after the other 2N/A * does the two steps: 2N/A * - check on all nodes if this md is opened. 2N/A * - remove the md locally. 2N/A * When the 2nd node asks all nodes if the md is 2N/A * open it starts with the first node. 2N/A * As this already removed the md, the check 2N/A * returns MDE_UNIT_NOT_SETUP. 2N/A * In order to not keep the 2nd node from proceeding, 2N/A * we map this to an Ok. 2N/A/* handler for MD_MN_MSG_REQUIRE_OWNER */ 2N/A /* Retry ownership change if we get EAGAIN returned */ 2N/A * handler for MD_MN_MSG_CHOOSE_OWNER 2N/A * This is called when a mirror resync has no owner. The master node generates 2N/A * this message which is not broadcast to the other nodes. The message is 2N/A * required as the kernel does not have access to the nodelist for the set. 2N/A * The node to be chosen will be the resync count for the set 2N/A * modulo the number of live nodes in the set 2N/A "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"),
setno);
2N/A "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n"));
2N/A /* Count the number of live nodes */ 2N/A * If we've been called with msg_chooseid_set_node set TRUE then we 2N/A * are simply re-setting the owner id to ensure consistency across 2N/A * If the flag is reset (B_FALSE) we are requesting a new owner to be 2N/A /* scan the nodelist looking for the required node */ 2N/A /* Send message to all nodes to make ownership change */ 2N/A /* inherit some flags from the parent message */ 2N/A * Handler for MD_MN_MSG_CHANGE_OWNER 2N/A * This is called when we are performing a resync and wish to change from 2N/A * no mirror owner to an owner chosen by the master. 2N/A * This message is only relevant for the new owner, the message will be 2N/A * ignored by all other nodes 2N/A "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"),
setno);
2N/A "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n"));
2N/A * If we are the chosen owner, issue ioctl to make the 2N/A * Single shot at changing the owner, if it fails EAGAIN, 2N/A * another node must have become the owner while we are in the 2N/A * process of making this choice. 2N/A/* handler for MD_MN_MSG_SUSPEND_WRITES */ 2N/A /* Suspend writes to a region of a mirror */ 2N/A * handler for MD_MN_MSG_STATE_UPDATE_RESWR 2N/A * This function updates a submirror component state and then resumes writes 2N/A /* Update the state of the component of a mirror */ 2N/A * submessage generator for MD_MN_MSG_STATE_UPDATE and MD_MN_MSG_STATE_UPDATE2 2N/A * This generates 2 messages, the first is SUSPEND_WRITES and 2N/A * depending on the type of the original message the second one is 2N/A * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does 2N/A * the same, but runs on a higher class. 2N/A return (
2);
/* Return the number of submessages generated */ 2N/A * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2 2N/A * This sends a message to all nodes requesting them to allocate a hotspare 2N/A * for the specified component. The component is specified by the mnum of 2N/A * the mirror, the submirror index and the component index. 2N/A /* Allocate a hotspare for a mirror component */ 2N/A * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST, 2N/A * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE 2N/A * Prior to running the resync thread first check that the start_step 2N/A * flag (MD_SET_MN_START_RC) added by metaclust's MC_START step has been 2N/A * removed from the set record flags. Ordinarily, this would be removed 2N/A * at MC_STEP4 in metaclust - need to ensure this has happened on all 2N/A /* Use magic to help protect ioctl against attack. */ 2N/A "MDMN_DO_RESYNC: Invalid setno = %d\n"),
2N/A /* start_flag always true initially */ 2N/A "MDMN_DO_RESYNC: Could not get start_step " 2N/A "flag for set %s - returning\n"),
2N/A /* metaioctl returns successfully - is start flag cleared? */ 2N/A "MDMN_DO_RESYNC: Waiting for start_step " 2N/A "flag for set %s to be cleared\n"),
2N/A "MDMN_DO_RESYNC: Could not clear " 2N/A "start_step flag for set %s " 2N/A * handler for MD_MN_MSG_SETSYNC 2N/A * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors 2N/A * and soft partitions, the driver name that is required for the ioctl call 2N/A * is included in the message. 2N/A * Dummy handler for various CLASS0 messages like 2N/A * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ... 2N/A * Overall description of mdcommd support that keeps all nodes in-sync 2N/A * with the ondisk diskset mddbs. 2N/A * or replicas must use a CLASS1 message to block out these changes. 2N/A * Changes to the state of existing replicas do not need to block CLASS1 2N/A * since there is no conflict when just updating the state of a replica. 2N/A * Error encountered when master writes to mddbs: 2N/A * As the master updates parts of the mddbs, flags are updated describing 2N/A * what has been written. When all locks are dropped (either in 2N/A * mddb_setexit or mdioctl), a PARSE message will be generated to all 2N/A * nodes with an index list of known good mddbs and the parse flags. 2N/A * The master node ignore the parse message since it sent it. 2N/A * The slave nodes re-read in the changed part of the mddb using the list 2N/A * of known good replicas that was passed. 2N/A * PARSE message does not block CLASS1. 2N/A * The PARSE message must be the highest class message. Since this 2N/A * message could be sent on any ioctl, this PARSE message class must 2N/A * be higher than any other class message that could issue an ioctl. 2N/A * Master Slave1 Slave2 2N/A * metadb -s set_name -a/-d 2N/A * metaset -s set_name -a/-d disk 2N/A * metaset -s set_name -b 2N/A * messages on all nodes until this message is finished. The master 2N/A * The BLOCK message is only run on the master node and will BLOCK 2N/A * the PARSE messages from being sent to the nodes. 2N/A * removes the replica(s) from the given disk slice. 
2N/A * The UNBLOCK message is only run on the master node and allows the 2N/A * sending of PARSE messages. 2N/A * Master Slave1 Slave2 2N/A * ATTACH msg to master 2N/A * ATTACH ATTACH ATTACH 2N/A * ATTACH msg finished 2N/A * Add/Delete host side information from the following commands: 2N/A * metaset -s set_name -a/-d -h 2N/A * The metaset command is run on the node executing the command and 2N/A * message whenever a host is added to or deleted from the diskset. 2N/A * The side information contains the major name and minor number 2N/A * associated with a disk slice from a certain node's perspective 2N/A * in an (failed) effort to support clustered systems that don't have the 2N/A * same device name for a physical device. (The original designers of 2N/A * SVM eventually took the shortcut of assuming that all device names 2N/A * are the same on all systems, but left the side information in the 2N/A * mddb and namespace.) The side information is used for disk slices 2N/A * that contain mddbs and/or are components for metadevices. 2N/A * for each mddb for the host being added or deleted. 2N/A * for all disk slice components that are in the namespace records for 2N/A * the host being added or deleted. 2N/A * and only needs to be executed on the master node since the slave 2N/A * nodes will be brought up to date by the PARSE message that is 2N/A * generated as a result of a change to the mddb. 2N/A * and needs to be run on all nodes. The message must block class1 2N/A * messages so that record changing commands don't interfere. 2N/A * Master Slave1 Slave2 2N/A * DB_NEWSIDE msg to master 2N/A * DB_NEWSIDE msg finished 2N/A * MD_NEWSIDE msg to master 2N/A * MD_NEWSIDE MD_NEWSIDE MD_NEWSIDE 2N/A * MD_NEWSIDE msg finished 2N/A * Optimized resync record failure: 2N/A * When any node sees a failure to write an optimized resync record 2N/A * that node notifies the master node of the replica that failed. 
2N/A * The master node handles the error and updates the rest of the 2N/A * nodes using a PARSE message. The PARSE message also calls 2N/A * fixoptrecord on each slave node causing each node to fix up 2N/A * the optimized resync records that are owned by that node (the mirror 2N/A * owner code also sets the optimized resync record owner). The master 2N/A * node will fix up all optimized resync records that have no owner or 2N/A * are owned by the master node. 2N/A * Master Slave1 Slave2 2N/A * Optimized Record Failure 2N/A * OPTRECERR msg to master 2N/A * Master handles opt rec failure 2N/A * OPTRECERR msg finished 2N/A * Slave rewrites optimized record 2N/A * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the 2N/A * slave nodes in order to keep the incore view of the mddbs the 2N/A * same on all nodes. 2N/A * Since master node generated the mddb parse message, do nothing 2N/A * if this is the master node. 2N/A * If this is a slave node, send the parse message down to the kernel 2N/A * where this node will re-read in parts of the mddbs. 2N/A * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation 2N/A * of parse messages from this node. 2N/A * slave node is unable to handle a parse message until the slave node 2N/A * then unblock the parse messages which causes the parse message to 2N/A * be sent to all nodes. 2N/A * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates 2N/A * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH 2N/A * message on all nodes and then an UNBLOCK message on the master only. 2N/A /* Don't log submessages and panic on inconsistent results */ 2N/A return (
3);
/* Return the number of submessages generated */ 2N/A * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates 2N/A * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH 2N/A * message on all nodes and then an UNBLOCK message on the master only. 2N/A /* Don't log submessages and panic on inconsistent results */ 2N/A return (
3);
/* Return the number of submessages generated */ 2N/A * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs. 2N/A * Used when running: 2N/A * metadb -s set_name -a 2N/A * metaset -s set_name -a/-d disk 2N/A * metaset -s set_name -b 2N/A * All nodes in MN diskset must do meta_check_replica 2N/A * since this causes the shared namespace to be 2N/A * populated by the md driver names while checking 2N/A * to see if this device is already in use as a 2N/A /* If newdev was successful, continue with attach */ 2N/A * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs. 2N/A * Used when running: 2N/A * metadb -s set_name -d 2N/A * metaset -s set_name -a/-d disk 2N/A * metaset -s set_name -b 2N/A /* Found a match - delete mddb */ 2N/A /* Not incrementing "i" intentionally (dbcnt is changed) */ 2N/A * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the 2N/A * side information for each diskset mddb when a new host has been 2N/A * added to the diskset. The side information is the /dev/dsk/ctds name 2N/A * that the new node would use to access each mddb. 2N/A * Since this routine makes no changes to the records in the diskset mddb, 2N/A * this routine only needs to be run on the master node. The master node's 2N/A * kernel code will detect that portions of the mddb have changed and 2N/A * will send a parse message to all nodes to re-parse parts of the mddb. 2N/A * Used when running: 2N/A * metaset -s set_name -a -h new_hostname 2N/A * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the 2N/A * side information for each diskset mddb when a host has been 2N/A * deleted from the diskset. The side information is the /dev/dsk/ctds name 2N/A * that the node would use to access each mddb. 2N/A * Since this routine makes no changes to the records in the diskset mddb, 2N/A * this routine only needs to be run on the master node. 
The master node's 2N/A * kernel code will detect that portions of the mddb have changed and 2N/A * will send a parse message to all nodes to re-parse parts of the mddb. 2N/A * Used when running: 2N/A * metaset -s set_name -d -h hostname 2N/A * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the 2N/A * side information for each diskset metadevice component (if that 2N/A * component is a disk) when a host has been added to the diskset. 2N/A * The side information is the /dev/dsk/ctds name that the node would 2N/A * use to access the metadevice component. 2N/A * This routine makes changes to the mddb records and must be run 2N/A * Used when running: 2N/A * metaset -s set_name -a -h new_hostname 2N/A /* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */ 2N/A /* Normal exit path is to eventually get a KEYWILD */ 2N/A * Okay we have a valid key 2N/A * Let's see if it is hsp or not 2N/A * If it is hsp add here 2N/A * The device reference count can be greater than 1 if 2N/A * more than one softpart is configured on top of the 2N/A * same device. If this is the case then we want to 2N/A * increment the count to sync up with the other sides. 2N/A * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the 2N/A * side information for each diskset metadevice component (if that 2N/A * component is a disk) when a host has been removed from the diskset. 2N/A * The side information is the /dev/dsk/ctds name that the node would 2N/A * use to access the metadevice component. 2N/A * This routine makes changes to the mddb records and must be run 2N/A * Used when running: 2N/A * metaset -s set_name -d -h hostname 2N/A /* Normal exit path is to eventually get a KEYWILD */ 2N/A * The device reference count can be greater than 1 if 2N/A * more than one softpart is configured on top of the 2N/A * same device. 
If this is the case then we want to 2N/A * decrement the count to zero so the entry can be 2N/A * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify 2N/A * the master node that a node has seen an error when attempting to 2N/A * write to the optimized resync records that reside on 2 of the diskset 2N/A * mddbs. Master node will mark the failed replica in error and this 2N/A * will send a parse message to all nodes to re-read parts of the mddb 2N/A * and to fix their optimized resync records based on this information. 2N/A for (i = 0; i <
2; i++) {
2N/A return (
4);
/* Return the number of submessages generated */ 2N/A * This is to send an MD_IOCSET ioctl to all nodes to create a soft 2N/A "MD_MN_MSG_IOCSET: Invalid setno %d\n"),
setno);
2N/A * Device should be in the namespace already 2N/A "MD_MN_MSG_IOCSET: Invalid mnum %d\n"),
2N/A * Create unit structure 2N/A * This is to update the status of a softpart 2N/A "MD_MN_MSG_IOCSET: Invalid setno %d\n"),
setno);
2N/A * This is to add a key to the namespace 2N/A "MD_MN_ADDKEYNAME: Invalid setno %d\n"),
setno);
2N/A * This is to delete a key from the namespace 2N/A "MD_MN_DELKEYNAME: Invalid setno %d\n"),
setno);
2N/A * Reset the key value for the name. This is required because 2N/A * any previous call of del_key_name for the same component 2N/A * will have resulted in the key value being reset to MD_KEYBAD 2N/A * even though there may still be references to this component. 2N/A * This is to get the value of tstate from the master node. We use this 2N/A * to get the ABR state of a metadevice from the master. 2N/A * This is to get the mirror ABR state and the state of its submirrors from 2N/A * the master node. We need this to ensure consistent output from metastat 2N/A * when a new node joins the cluster during a resync. Without this the 2N/A * submirror status will be incorrect until the whole resync is complete which 2N/A * may take days for very large metadevices. 2N/A /* Validate set information from minor number */ 2N/A "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"),
setno);
2N/A /* Construct mirror name from minor number */ 2N/A "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
2N/A /* Get common mirror structure */ 2N/A "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"),
2N/A "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
2N/A * gets passed back to the message originator 2N/A /* Return value of tstate for mirror */ 2N/A * This is to issue an ioctl to call poke_hotspares 2N/A * Called to create a softpart during a metarecover operation 2N/A "MD_MN_MSG_ADDMDNAME: Invalid setno %d\n"),
2N/A * If device node does not exist then init it 2N/A "MD_MN_MSG_ADDMDNAME: Invalid name %s\n"),
2N/A * This is used to issue a MD_MN_RR_DIRTY ioctl to the mirror. 2N/A * This is used to issue a MD_MN_RR_CLEAN ioctl to the mirror.