meta_import.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <assert.h>
#include <ctype.h>
#include <libdevinfo.h>
#include <mdiox.h>
#include <meta.h>
#include "meta_repartition.h"
#include "meta_set_prv.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One entry in the list of device ids gathered for a diskset.
 * For a replicated set, the device id recorded in the locator block (lb)
 * is the 'other' one, so the real device id is kept alongside it.
 */
struct did_list {
	void	*rdid;		/* real device id; used for replicated sets */
	void	*did;		/* device id as stored in the locator block */
	char	*devname;	/* device name for this entry */
	char	*minor_name;	/* minor name for this entry */
};
typedef struct did_list did_list_t;
typedef struct replicated_disk {
void *old_devid;
void *new_devid;
struct replicated_disk *next;
/*
* The current implementation limits the max device id length to 256 bytes.
* Should the max device id length be increased, this define would have to
* be bumped up accordingly
*/
#define MAX_DEVID_LEN 256
/*
* We store a global list of all the replicated disks in the system. In
* order to prevent us from performing a linear search on this list, we
* store the disks in a two dimensional sparse array. The disks are bucketed
* based on the length of their device ids.
*/
/*
* The list of replicated disks is built just once and this flag is set
* once it's done
*/
static int replicated_disk_list_built = 0;
/*
* Map logical blk to physical
*
* This is based on the routine of the same name in the md kernel module (see
* file md_mddb.c), with the following caveats:
*
* - The kernel routine works on in core master blocks, or mddb_mb_ic_t; this
* routine works instead on the mddb_mb_t read directly from the disk
*/
static daddr_t
)
{
/*
* Sanity check: is the block within range? If so, we then assume
* that the block range map in the master block is valid and
* consistent with the block count. Unfortunately, there is no
* reliable way to validate this assumption.
*/
return ((daddr_t)-1);
}
/*
* drive_append()
*
* Append to tail of linked list of md_im_drive_info_t.
*
* Will allocate space for new node and copy args into new space.
*
* Returns pointer to new node.
*/
static md_im_drive_info_t *
void *devid,
void *rdevid,
int devid_sz,
char *minor_name,
)
{
int o_devid_sz;
;
/*
* If rdevid is not NULL then we know we are dealing with
* replicated diskset case. 'devid_sz' will always be the
* size of a valid devid which can be 'devid' or 'rdevid'
*/
if (rdevid) {
/*
* Also need to store the 'other' devid
*/
} else {
/*
* In the case of regular diskset, midp->mid_o_devid
* will be a NULL pointer
*/
}
return (midp);
}
/*
* drive_append_wrapper()
*
* Constant time append wrapper; the append function will always walk the list,
* this will take a tail argument and use the append function on just the tail
* node, doing the appropriate old-tail-next-pointer bookkeeping.
*/
static md_im_drive_info_t **
void *devid,
void *rdevid,
int devid_sz,
char *minor_name,
)
{
return (tailpp);
}
/*
* replica_append()
*
* Append to tail of linked list of md_im_replica_info_t.
*
* Will allocate space for new node and copy args into new space.
*
* Returns pointer to new node.
*/
static md_im_replica_info_t *
int flags,
)
{
;
return (mirp);
}
/*
* replica_append_wrapper()
*
* Constant time append wrapper; the append function will always walk the list,
* this will take a tail argument and use the append function on just the tail
* node, doing the appropriate old-tail-next-pointer bookkeeping.
*/
static md_im_replica_info_t **
int flags,
)
{
return (tailpp);
}
/*
* map_replica_disk()
*
* Searches the device id list for a specific
* disk based on the locator block device id array index.
*
* Returns a pointer to the did_list node if a match was
* found or NULL otherwise.
*/
static did_list_t *
int did_index
)
{
return (tailp);
}
/* not found, return failure */
return (NULL);
}
/*
* replicated_list_lookup()
*
* looks up a replicated disk entry in the global replicated disk list
* based upon the length of that disk's device id. returns the new device id
* for the disk.
* If you store the returned devid you must create a local copy.
*/
static void *
void *old_devid
)
{
return (NULL);
do {
return (NULL);
}
/*
* replicated_list_insert()
*
* inserts a replicated disk entry into the global replicated disk list
*/
static void
void *old_devid,
void *new_devid
)
{
void *repl_old_devid = NULL;
if (*first_entry == NULL) {
*first_entry = repl_disk;
return;
}
}
/*
* get_replica_disks()
*
* Will step through the locator records in the supplied locator block, and add
* each one with an active replica to a supplied list of md_im_drive_info_t, and
* add the appropriate replicas to the md_im_replica_info_t contained therein.
*/
static void
md_error_t *ep,
int replicated
)
{
int flags;
int devid_sz;
char *minor_name;
void *did;
on_list = 0;
/*
* search the device id list for a
* specific ctds based on the locator
* block device id array index.
*/
/*
* metadrivename() can fail for a slice name
* if there is not an existing mddrivename_t.
* So we use metadiskname() to strip the slice
* number.
*/
on_list = 1;
break;
}
}
/*
* Get the correct devid_sz
*/
if (replicated)
else
/*
* New on the list so add it
*/
if (!on_list) {
int fd = -1;
/* determine the replica slice */
ep) != 0) {
continue;
}
/*
* if the replica slice size is zero,
* don't bother opening
*/
continue;
}
continue;
}
continue;
}
/*
* a drive may not have a master block
*/
DEV_BSIZE) <= 0) {
mdclrerror(ep);
continue;
}
NULL);
}
/*
* For either of these assertions to fail, it implies
* a NULL return from metadrivename() above. Since
* the args came from a presumed valid locator block,
* that's Bad.
*/
/*
* Extract the parameters describing this replica.
*
* The magic "1" in the length calculation accounts
* for the length of the master block, in addition to
* the block count it describes. (The master block
* will always take up one block on the disk, and
* there will always only be one master block per
* replica, even though much of the code is structured
* to handle noncontiguous replicas.)
*/
/*
* If we're here it means -
*
* a) we had an active copy of the replica, and
* b) we've added the disk to the list of
* disks as well.
*
* We need to bump up the active replica count
* for each such replica so that it can be used
* later for the replica quorum check.
*/
}
}
}
/*
* get_nonreplica_disks()
*
* Extracts the disks without replicas from the locator name space and adds them
* to the supplied list of md_im_drive_info_t.
*/
static void
md_error_t *ep,
int replicated
)
{
char *search_path = "/dev";
int on_list = 0;
int devid_sz;
struct devid_min_rec *did_rec;
struct devid_shr_rec *did_shr_rec;
struct did_shr_name *did;
struct did_min_name *min;
void *r_did; /* NULL if not a replicated diskset */
void *valid_did;
/*
* We got a pointer to an mddb record, which we expect to contain a
* name record; extract the pointer thereto.
*/
/* LINTED */
/* LINTED */
did_shr_rec = (struct devid_shr_rec *)
/*
* Skip the nm_rec_hdr and iterate on the array of struct minor_name
* at the end of the devid_min_rec
*/
/* LINTED */
on_list = 0;
/*
* For a given DID_NM key, locate the corresponding device
* id from DID_NM_SHR
*/
/* LINTED */
did = (struct did_shr_name *)
/*
* We got a match, this is the device id we're
* looking for
*/
break;
}
/* we didn't find a match */
}
/*
* If replicated diskset
*/
if (replicated) {
char *temp;
/*
* In this case, did->did_devid will
* be invalid so lookup the real one
*/
} else {
}
/* Get the ctds mapping for that device id */
/* Don't bother with metadevices, but track disks */
/* Is it already on the list? */
on_list = 1;
break;
}
}
if (!on_list) {
int fd = -1;
/* determine the replica slice */
ep) != 0) {
continue;
}
/*
* if the replica slice size is zero,
* don't bother opening
*/
== 0) {
continue;
}
continue;
}
continue;
}
/*
* a drive may not have a master block
*/
DEV_BSIZE) <= 0) {
mdclrerror(ep);
continue;
}
/*
* If it is replicated diskset,
* r_did will be non-NULL and
* devid_sz will be its size
*/
}
}
}
}
}
/*
* set_append()
*
* Append to tail of linked list of md_im_set_desc_t.
*
* Will allocate space for new node AND populate it by extracting disks with
* and without replicas from the locator blocks and locator namespace.
*
* Returns pointer to new node.
*/
static md_im_set_desc_t *
md_error_t *ep,
int replicated
)
{
/* run to end of list */
;
/* allocate new list element */
if (replicated)
/* Get the disks with and without replicas */
}
/*
* An error in this struct could come from either of the above routines;
* in both cases, we want to pass it back on up.
*/
return (misp);
}
/*
* set_append_wrapper()
*
* Constant time append wrapper; the append function will always walk the list,
* this will take a tail argument and use the append function on just the tail
* node, doing the appropriate old-tail-next-pointer bookkeeping.
*/
static md_im_set_desc_t **
md_error_t *ep,
int replicated
)
{
/* it's the first item in the list, return it instead of the next */
}
/*
* add_disk_names()
*
* Iterator to walk the minor node tree of the device snapshot, adding only the
* first non-block instance of each non-cdrom minor node to a list of disks.
*/
static int
{
char *search_path = "/dev";
/*
* skip CD devices
* If a device does not have a device id, we can't
* do anything with it so just exclude it from our
* list.
*
* This would also encompass CD devices and floppy
* devices that don't have a device id.
*/
return (DI_WALK_CONTINUE);
}
/* char disk devices (as opposed to block) */
/* only first occurrence (slice 0) of each instance */
sizeof (char *));
}
}
}
return (DI_WALK_CONTINUE);
}
/*
* meta_list_disks()
*
* Snapshots the device tree and extracts disk devices from the snapshot.
*/
int
{
== DI_NODE_NIL) {
}
return (0);
}
/*
* meta_imp_drvused
*
* Checks if given drive is mounted, swapped, part of disk configuration
* or in use by SVM. ep also has error code set up if drive is in use.
*
* Returns 1 if drive is in use.
* Returns 0 if drive is not in use.
*/
int
)
{
/*
* We pass in db_ep to meta_setup_db_locations
* and never ever use the error contained therein
* because all we're interested in is a check to
* see whether any local metadbs are present.
*/
(((meta_setup_db_locations(db_ep) == 0) &&
return (1);
} else {
return (0);
}
}
/*
* meta_prune_cnames()
*
* Removes in-use disks from the list prior to further processing.
*
* Return value depends on err_on_prune flag: if set, and one or more disks
* are pruned, the return list will be the pruned disks. If not set, or if no
* disks are pruned, the return list will be the unpruned disks.
*/
md_error_t *ep,
int err_on_prune
)
{
int d;
int fcount = 0;
/*
* Assuming we're interested in knowing about
* whatever error occurred, but not in stopping.
*/
mdclrerror(ep);
continue;
}
/*
* Check if the drive is inuse.
*/
fcount++;
mdclrerror(ep);
} else {
}
}
if (fcount) {
if (err_on_prune) {
return (fdnlp);
}
}
return (dnlp);
}
/*
* read_master_block()
*
* Returns:
* < 0 for failure
* 0 for no valid master block
* 1 for valid master block
*
* The supplied buffer will be filled in for EITHER 0 or 1.
*/
int
md_error_t *ep,
int fd,
void *bp,
int bsize
)
{
int rval = 1;
/*
* The master block magic number can either be MDDB_MAGIC_MB in
* the case of a real master block, or, it can be MDDB_MAGIC_DU
* in the case of a dummy master block
*/
rval = 0;
}
rval = 0;
}
return (rval);
}
/*
* read_locator_block()
*
* Returns:
* < 0 for failure
* 0 for no valid locator block
* 1 for valid locator block
*/
int
md_error_t *ep,
int fd,
void *bp,
int bsize
)
{
}
int
md_error_t *ep,
int fd,
void *bp,
int bcount
)
{
return (bcount);
}
/*
* read_locator_block_did()
*
* Returns:
* < 0 for failure
* 0 for no valid locator name struct
* 1 for valid locator name struct
*/
int
md_error_t *ep,
int fd,
void *bp,
int bsize
)
{
int rval;
return (rval);
}
/*
* read_locator_names()
*
* Returns:
* < 0 for failure
* 0 for no valid locator name struct
* 1 for valid locator name struct
*/
int
md_error_t *ep,
int fd,
void *bp,
int bsize
)
{
int rval;
return (rval);
}
int
md_error_t *ep,
int fd,
int dbblk,
void *bp,
int bsize
)
{
int rval;
return (rval);
}
int
md_error_t *ep,
int fd,
int didblk,
void *bp,
int bsize
)
{
int rval;
return (rval);
}
int
md_error_t *ep,
int fd,
int infoblk,
void *bp,
int bsize
)
{
int rval = 1;
return (rval);
}
/*
* meta_nm_rec()
*
* Return the DE corresponding to the requested namespace record type.
* Modifies dbp to have a firstentry if one isn't there.
*/
static mddb_de_t *
{
int desize;
/* LINTED */
+ sizeof (dbp->db_firstentry));
/* LINTED */
}
}
break;
}
return (dep);
}
/*
* read_nm_rec()
*
* Reads the NM, NM_DID or NM_DID_SHR record in the mddb and stores the
* configuration data in the buffer 'nm'
*
* Returns:
* < 0 for failure
* 0 for no valid NM/DID_NM/DID_NM_SHR record
* 1 for valid NM/DID_NM/DID_NM_SHR record
*
*/
static int
md_error_t *ep,
int fd,
char **nm,
char *diskname
)
{
/*LINTED*/
dbblk != 0;
sizeof (db))) <= 0)
return (rval);
/*
* Locate NM/DID_NM/DID_NM_SHR record. Normally there is
* only one record per mddb. There is a rare case when we
* can't expand the record. If this is the case then we
* will have multiple NM/DID_NM/DID_NM_SHR records linked
* with r_next_recid.
*
* For now assume the normal case and handle the extended
* namespace in Phase 2.
*/
break;
}
/* If meta_nm_rec() never succeeded, bail out */
return (0);
/* Read in the appropriate record and return configurations */
return (rval);
}
return (rval);
}
return (rval);
}
}
return (1);
}
/*
* is_replicated
*
* Determines whether a disk has been replicated or not. It checks to see
* if the device id stored in the master block is the same as the device id
* registered for that disk on the current system. If the two device ids are
* different, then we know that the disk has been replicated.
*
* If need_devid is set and the disk is replicated, fill in the new_devid.
* Also, if need_devid is set, this routine allocates memory for the device
* ids; the caller of this routine is responsible for free'ing up the memory.
*
* Returns:
* 1 if it's a replicated disk
* 0 if it's not a replicated disk
*/
static int
int fd,
int need_devid,
void **new_devid
)
{
int retval = 0;
return (retval);
return (retval);
retval = 1;
if (retval && need_devid) {
}
return (retval);
}
/*
* free_replicated_disks_list()
*
* this frees up all the memory allocated by build_replicated_disks_list
*/
static void
{
int index;
}
}
}
/*
* build_replicated_disks_list()
*
* Builds a list of disks that have been replicated using either
* remote replication or point-in-time replication software. The
* list is stored as a two-dimensional sparse array.
*
* Returns
* 1 on success
* 0 on failure
*/
static int
md_error_t *ep,
)
{
int fd = -1;
void *new_devid;
/* determine the replica slice */
continue;
/*
* if the replica slice size is zero, don't bother opening
*/
continue;
continue;
/* a drive may not have a master block so we just continue */
mdclrerror(ep);
continue;
}
}
}
return (1);
}
/*
* free_did_list()
*
* Frees the did_list allocated as part of build_did_list
*/
static void
)
{
if (temp->minor_name)
}
}
/*
* build_did_list()
*
* Build a list of device ids corresponding to disks in the locator block.
* Memory is allocated here for the nodes in the did_list. The callers of
* this routine must also call free_did_list to free up the memory after
* they're done.
*
* Returns:
* < 0 for failure
* 0 for no valid locator block device id array
* 1 for valid locator block device id array
* ENOTSUP partial diskset, not all disks in a diskset on the
* system where import is being executed
*/
static int
md_error_t *ep,
int fd,
int replicated
)
{
char *search_path = "/dev";
char *minor_name;
uint_t did_info_length = 0;
uint_t did_info_firstblk = 0;
continue;
/*
* If the buffer that has already been read in can be
* re-used, then just use it. Otherwise free the
* previous one and allocate a new one.
*/
if (bp)
(void *)bp, did_info_length)) < 0)
return (rval);
} else {
}
/*
* If we are not able to find the ctd mapping corresponding
* to a given device id, it probably means the device id in
* question is not registered with the system.
*
* Highly likely that the only time this happens, we've hit
* a case where not all the disks that are a part of the
* diskset were moved before importing the diskset.
*
* If set is a replicated diskset, then the device id we get
* from 'lb' will be the 'other' did and we need to lookup
* the real one before we call this routine.
*/
if (replicated) {
} else {
}
return (-1);
}
return (ENOTSUP);
}
}
/* Free the last bp */
if (bp)
return (1);
}
/*
* meta_get_set_info
*
* Scans a given drive for set specific information. If the given drive
* has a shared metadb, scans the shared metadb for information pertaining
* to the set.
*
* Returns:
* <0 for failure
* 0 success but no replicas were found
* 1 success and a replica was found
* ENOTSUP for partial disksets detected
*/
int
int local_mb_ok,
)
{
uint_t s;
int fd;
/*LINTED*/
/*LINTED*/
int lnsize, lbdid_size;
int rval = 0;
/*LINTED*/
struct devid_shr_rec *did_shrnmp;
struct devid_min_rec *did_nmp;
int extended_namespace = 0;
int replicated = 0;
/*
* Determine and open the replica slice
*/
return (-1);
}
/*
* Test for the size of replica slice in question. If
* the size is zero, we know that this is not a disk that was
* part of a set and it should be silently ignored for import.
*/
return (0);
return (-1);
}
/*
* After the open() succeeds, we should return via the "out"
* label to clean up after ourselves. (Up 'til now, we can
* just return directly, because there are no resources to
* give back.)
*/
goto out;
rval = 0;
goto out;
}
goto out;
/*
* Once the locator block has been read, we need to
* check if the locator block commit count is zero.
* If it is zero, we know that the replica we're dealing
* with is on a disk that was deleted from the disk set;
* and, it potentially has stale data. We need to quit
* in that case
*/
if (lbp->lb_commitcnt == 0) {
rval = 0;
goto out;
}
/*
* Make sure that the disk being imported has device id
* namespace present for disksets. If a disk doesn't have
* device id namespace, we skip reading the replica on that disk
*/
rval = 0;
goto out;
}
/*
* Grab the locator block device id array. Allocate memory for the
* array first.
*/
lbdid_size)) <= 0)
goto out;
/*
* For a disk that has not been replicated, extract the device ids
* stored in the locator block device id array and store them in
* a list.
*
* If the disk has been replicated using replication software such
* as HDS Truecopy/ShadowImage or EMC SRDF/BCV, the device ids in
* the locator block are invalid and we need to build a list of
* replicated disks.
*/
if (replicated && !replicated_disk_list_built) {
/*
* if there's a replicated diskset involved, we need to
* scan the system one more time and build a list of all
* candidate disks that might be part of that replicated set
*/
rval = 0;
goto out;
}
if (rval == 0)
goto out;
}
goto out;
/*
* Until here, we've gotten away with fixed sizes for the
* master block and locator block. The locator names,
* however, are sized (and therefore allocated) dynamically
* according to information in the locator block.
*/
goto out;
/*
* Read in the NM record
* If no NM record was found, it still is a valid configuration
* but it also means that we won't find any corresponding DID_NM
* or DID_SHR_NM.
*/
< 0)
goto out;
else if (rval == 0)
goto append;
/*
* At this point, we have read in all of the blocks that form
* the nm_rec. We should at least detect the corner case
* mentioned above, in which r_next_recid links to another
* nm_rec. Extended namespace handling is left for Phase 2.
*
* What this should really be is a loop, each iteration of
* which reads in a nm_rec and calls the set_append_wrapper().
*/
/*LINTED*/
extended_namespace = 1;
rval = 0;
goto out;
}
goto out;
else if (rval == 0)
goto append;
/*LINTED*/
extended_namespace = 1;
rval = 0;
goto out;
}
goto out;
else if (rval == 0)
goto append;
/*LINTED*/
extended_namespace = 1;
rval = 0;
goto out;
}
/* Finally, we've got what we need to process this replica. */
/*LINTED*/
ep, replicated);
/* Return the fact that we found at least one set */
rval = 1;
out:
if (fd >= 0)
/*
* If we are at the end of the list, we must free up
* the replicated list too
*/
if (extended_namespace)
return (rval);
}
/*
* Return the minor name associated with a given disk slice
*/
static char *
char *devname,
)
{
int fd = -1;
char *minor_name = NULL;
char *ret_minor_name = NULL;
return (NULL);
return (NULL);
}
}
return (ret_minor_name);
}
static int
)
{
int replica_count = 0;
== NULL)) {
mdclrerror(ep);
continue;
}
continue;
/*
* The drive is okay now count its replicas
*/
}
}
return (-1);
return (0);
}
static set_t
)
{
int bool;
return (MD_SET_BAD);
}
/*
* This code needs to be expanded when we run in a SunCluster
* environment; SunCluster obtains the setno internally.
*/
&bool, ep) == -1) {
setno = MD_SET_BAD;
break;
}
/*
* found one available
*/
if (bool == FALSE)
break;
}
setno = MD_SET_BAD;
}
return (setno);
}
int
char *setname,
int force,
)
{
struct mddb_config c;
char setnum_link[MAXPATHLEN];
char setname_link[MAXPATHLEN];
char *minor_name = NULL;
(void) memset(&c, 0, sizeof (c));
c.c_sideno = 0;
c.c_flags = MDDB_C_IMPORT;
/*
* Check to see if the setname that the set is being imported into,
* already exists.
*/
}
/*
* Find the next available set number
*/
}
}
c.c_timestamp = tp;
/* Check to see if replica quorum requirement is fulfilled */
/*
* We pass the list of the drives in the set
* down to the kernel irrespective of whether
* the drives have a replica or not.
*
* The kernel detects which of the drives don't
* have a replica and accordingly does the
* right thing.
*/
== NULL)) {
mdclrerror(ep);
continue;
}
midp->mid_devid_sz);
if (midp->mid_o_devid) {
c.c_locator.l_old_devid =
}
sizeof (c.c_locator.l_minor_name));
mdclrerror(ep);
continue;
}
do {
if (mirp) {
} else {
/*
* Default offset for dummy is 16
*/
}
if (c.c_locator.l_old_devid)
}
}
/*
* If the dry run option was specified, flag success
* and exit out
*/
if (dry_run == 1) {
"import should be successful"));
if (c.c_locator.l_old_devid)
return (0);
}
/*
* Now the kernel should have all the information
* regarding the import diskset replicas.
* Tell the kernel to load them up and import the set.
*/
if (c.c_locator.l_old_devid)
}
/* The set has now been imported, create the appropriate symlink */
/*
* Since we already verified that the setname was OK, make sure to
* cleanup before proceeding.
*/
}
/* resnarf the set that has just been imported */
"restart rpc.metad"));
if (c.c_locator.l_old_devid)
return (0);
}