md_mddb.c revision b66a069d2afac09e04d1c40a2b503692940b78d8
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/systeminfo.h>
#include <sys/sysmacros.h>
1000,
{ 6000, 6000, 30000 }
};
#define MDDB
extern char svm_bootpath[];
int md_maxbootlist = MAXBOOTLIST;
static int mddb_maxbufheaders = 50;
/*
* If this is set, more detailed messages about DB init will be given, instead
* of just the MDE_DB_NODB.
*/
static int mddb_db_err_detail = 0;
/*
*/
/*
* You really do NOT want to change this boolean.
* It can be VERY dangerous to do so. Loss of
* data may occur. USE AT YOUR OWN RISK!!!!
*/
static int mddb_allow_half = 0;
/*
* For mirrored root allow reboot with only half the replicas available
* Flag inserted for Santa Fe project.
*/
int mirrored_root_flag;
((c) == '\r') || ((c) == '\n'))
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t *md_devinfo;
extern int md_init_debug;
extern int md_status;
extern md_ops_t *md_opslist;
extern md_krwlock_t nm_lock;
/*
* Defines for crc calculation for records
* rec_crcgen generates a crc checksum for a record block
* rec_crcchk checks the crc checksum for a record block
*/
#define REC_CRCGEN 0
#define REC_CRCCHK 1
/*
* During upgrade, SVM basically runs with the devt from the target
* being upgraded. Translations are made from the target devt to the
* miniroot devt when writing data out to the disk. This is done by
* the following routines:
* wrtblklst
* writeblks
* readblklst
* readblks
* dt_read
*
* The following routines are used by the routines listed above and
* expect a translated (aka miniroot) devt:
* getblks
* getmasters
*
* Also, when calling any system routines, such as ddi_lyr_get_devid,
* the translated (aka miniroot) devt must be used.
*
* By the same token, the major number and major name conversion operations
* need to use the name_to_major file from the target system instead
* of the name_to_major file on the miniroot. So, calls to
* ddi_name_to_major must be replaced with calls to md_targ_name_to_major
* when running on an upgrade. Same is true with calls to
* ddi_major_to_name.
*/
#ifndef MDDB_FAKE
static int
mddb_set_t *s, /* incore db set structure */
int flag, /* B_ASYNC or 0 passed in here */
)
{
int err = 0;
if (mdv_strategy_tstpnt == NULL ||
(void) bdev_strategy(bp);
return (0);
}
return (err);
}
static void
mddb_set_t *s,
identifier_t *ident
)
{
if (s->s_setno == MD_LOCAL_SET)
else
}
static int
mddb_set_t *s,
identifier_t *ident
)
{
if (s->s_setno == MD_LOCAL_SET)
else
return (timercmp(&ident->createtime,
/*CSTYLED*/
&s->s_ident.createtime, !=));
}
static int
)
{
return (0);
return (1);
}
static void
)
{
}
/*
* stripe_skip_ts
*
* Returns a list of fields to be skipped in the stripe record structure.
* These fields are ms_timestamp in the component structure.
* Used to skip these fields when calculating the checksum.
*/
static crc_skip_t *
{
struct ms_row32_od *small_mdr;
crc_skip_t skip_start = {0, 0, 0};
if (revision == MDDB_REV_RB) {
skip_prev = &skip_start;
return (NULL);
/*
* walk through all rows to find the total number
* of components
*/
ncomps = 0;
}
/* Now walk through the components */
(comp * sizeof (ms_comp32_od_t));
KM_SLEEP);
}
} else {
skip_prev = &skip_start;
return (NULL);
/*
* walk through all rows to find the total number
* of components
*/
ncomps = 0;
}
/* Now walk through the components */
KM_SLEEP);
}
}
/* Return the start of the list of fields to skip */
return (skip_start.skip_next);
}
/*
* mirror_skip_ts
*
* Returns a list of fields to be skipped in the mirror record structure.
* This includes un_last_read and sm_timestamp for each submirror
* Used to skip these fields when calculating the checksum.
*/
static crc_skip_t *
{
int i;
crc_skip_t skip_start = {0, 0, 0};
skip_prev = &skip_start;
if (revision == MDDB_REV_RB) {
un_last_read) + rb_off;
} else {
un_last_read) + rb_off;
}
for (i = 0; i < NMIRROR; i++) {
if (revision == MDDB_REV_RB) {
} else {
}
}
/* Return the start of the list of fields to skip */
return (skip_start.skip_next);
}
/*
* hotspare_skip_ts
*
* Returns a list of the timestamp fields in the hotspare record structure.
* Used to skip these fields when calculating the checksum.
*/
static crc_skip_t *
{
if (revision == MDDB_REV_RB) {
} else {
}
return (skip);
}
/*
* rec_crcfunc
*
* Calculate or check the checksum for a record
* Calculate the crc if check == 0, Check the crc if check == 1
*
* A record block may be written by different nodes in a multi-owner diskset
* (in case of a master change), so the function rec_crcchk excludes timestamp
* fields from the crc computation of record data.
* Otherwise, timestamp fields would cause each node to have a different
* checksum for the same record block, causing the exclusive-or of all record
* block checksums and data block record sums to be non-zero after a new master
* writes at least one record block.
*/
static uint_t
mddb_set_t *s,
int check
)
{
/*
* Generate a list of the areas to be skipped when calculating
* the checksum.
* First skip rb_checksum, rb_private and rb_userdata.
*/
if (MD_MNSET_SETNO(s->s_setno)) {
/* For a MN set, skip rb_timestamp */
KM_SLEEP);
/* Now add a list of timestamps to be skipped */
if (type >= MDDB_FIRST_MODID) {
case MDDB_F_STRIPE:
rbp->rb_revision);
break;
case MDDB_F_MIRROR:
break;
case MDDB_F_HOTSPARE:
break;
default:
break;
}
}
}
if (check) {
} else {
}
while (skip) {
}
return (ret);
}
static mddb_bf_t *
mddb_set_t *s,
int sleepflag
)
{
if (sleepflag == MDDB_NOSLEEP)
++s->s_bufmisses;
#ifdef DEBUG
if (s->s_bufmisses == 1)
"md: mddb: set %u sleeping for buffer", s->s_setno);
#endif
s->s_bufwakeup = 1;
}
return (bfp);
}
static void
mddb_set_t *s,
)
{
s->s_freebufhead = bfp;
if (s->s_bufwakeup) {
cv_broadcast(&s->s_buf_cv);
s->s_bufwakeup = 0;
}
}
int
)
{
return (1);
return (1);
return (0);
}
static void
mddb_set_t *s,
)
{
s->s_freeblkcnt--;
}
static void
mddb_set_t *s,
)
{
s->s_freeblkcnt++;
}
static int
mddb_set_t *s,
)
{
}
/*
* not fast but simple
*/
static mddb_block_t
mddb_set_t *s,
)
{
int i;
contig = 0;
for (i = 0; i < s->s_totalblkcnt; i++) {
if (blkcheck(s, i)) {
contig = 0;
} else {
contig++;
blkbusy(s, i);
return ((mddb_block_t)contig);
}
}
}
return (0);
}
static void
mddb_set_t *s
)
{
int i;
int minblks;
int freeblks;
int nblks;
minblks = 0;
maxblk = 0;
/*
* Determine the max number of blocks.
*/
/*
* go through and find highest logical block
*/
for (i = 0; i < dep->de_blkcount; i++)
}
continue;
freeblks = 0;
}
if (freeblks == 0) /* this happen when there is no */
continue; /* master blk */
}
}
/*
* set up reasonable freespace if no
* data bases exist
*/
if (minblks == 0)
minblks = 100;
s->s_freeblkcnt = minblks;
s->s_totalblkcnt = minblks;
if (! s->s_freebitmapsize) {
KM_SLEEP);
}
/* locator block sectors */
blkbusy(s, i);
/* locator name sectors */
for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
/* locator block device id information */
for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
/* disk blocks containing actual device ids */
while (did_dbp) {
}
}
}
/* Only use data tags if not a MN set */
/* Found a bad tag, do NOT mark the data tag blks busy here */
for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
}
}
for (i = 0; i < dep->de_blkcount; i++)
}
}
/*
* Add free space to the device id incore free list.
* Called:
* - During startup when all devid blocks are temporarily placed on the
* free list
* - After a devid has been deleted via the metadb command.
* - When mddb_devid_free_get adds unused space from a disk block
* to free list
*/
static int
mddb_set_t *s,
)
{
return (0);
}
KM_SLEEP);
return (0);
}
/*
* Remove specific free space from the device id incore free list.
* Called at startup (after all devid blocks have been placed on
* free list) in order to remove the free space from the list that
* contains actual devids.
* Returns 0 if area successfully removed.
* Returns 1 if no matching area is found - so nothing removed.
*/
static int
mddb_set_t *s,
)
{
int block_found = 0;
return (1);
}
/* find free block for this devid */
while (did_freep1) {
/*
* Look through free list of <block, offset, length> to
* find our entry in the free list. Our entry should
* exist since the entire devid block was placed into
* this free list at startup. This code is just removing
* the non-free (in-use) portions of the devid block so
* that the remaining linked list does indeed just
* contain a free list.
*
* Our entry has been found if
* - the blocks match,
* - the offset (starting address) in the free list is
* less than the offset of our entry and
* - the length+offset (ending address) in the free list is
* greater than the length+offset of our entry.
*/
/* Have found our entry - remove from list */
block_found = 1;
/* did_freep1 - pts to next free block */
if (did_freep2) {
} else {
}
/*
* did_freep_before points to area in block before
* offset, length.
*/
/*
* did_freep_after points to area in block after
* offset, length.
*/
(sizeof (mddb_did_free_t), KM_SLEEP);
/*
* Add before and after areas to free list
* If area before or after offset, length has length
* of 0, that entry is not added.
*/
if (did_freep_after->free_length) {
if (did_freep2) {
} else {
s->s_did_icp->did_ic_freep =
}
} else {
sizeof (mddb_did_free_t));
}
if (did_freep_before->free_length) {
if (did_freep2) {
} else {
s->s_did_icp->did_ic_freep =
}
} else {
sizeof (mddb_did_free_t));
}
break;
} else {
}
}
if (block_found == 0) {
return (1);
} else {
return (0);
}
}
/*
* Find free space of devid length and remove free space from list.
* Return a pointer to the previously free area.
*
* If there's not enough free space on the free list, get an empty
* disk block, put the empty disk block on the did_ic_dbp linked list,
* and add the disk block space not used for devid to the free list.
*
* Return pointer to address (inside disk block) of free area for devid.
* Return 0 if error.
*/
static caddr_t
mddb_set_t *s,
)
{
return (0);
}
while (freep) {
/* found a free area - remove from free list */
/* find disk block pointer that contains free area */
while (dbp) {
break;
else
}
/*
* If a disk block pointer can't be found - something
* is wrong, so don't use this free space.
*/
continue;
}
/* Update free list information */
if (freep->free_length == 0) {
if (freep2) {
} else {
s->s_did_icp->did_ic_freep =
}
}
break;
}
}
/* Didn't find a free spot */
/* get free logical disk blk in replica */
if (blk_num == 0)
return (0);
/* Add disk block to disk block linked list */
/* Update return values */
*offset = 0;
/* Add unused part of block to free list */
(void) mddb_devid_free_add(s, blk_num,
}
}
/*
* Add device id information for locator index to device id area in set.
* Get free area to store device id from free list. Update checksum
* for mddb_did_blk.
*
* This routine does not write any data out to disk.
* After this routine has been called, the routine, writelocall, should
* be called to write both the locator block and device id area out
* to disk.
*/
static int
mddb_set_t *s,
char *minor_name
)
{
return (1);
}
return (1);
/* Check if device id has already been added */
return (0);
devid_ptr = (ddi_devid_t)
&offset);
return (1);
}
/* Copy devid into devid free area */
for (i = 0; i < devid_len; i++)
/* Update mddb_did_info area for new device id */
/* Add device id pointer to did_ic_devid array */
return (0);
}
/*
* Delete device id information for locator index from device id area in set.
* Add device id space to free area.
*
* This routine does not write any data out to disk.
* After this routine has been called, the routine, writelocall, should
* be called to write both the locator block and device id area out
* to disk.
*/
static int
{
return (1);
}
/* Get device id information from mddb_did_blk */
/*
* Ensure that the underlying device supports device ids
* before arbitrarily removing them.
*/
return (1);
}
/* Remove device id information from mddb_did_blk */
did_info->info_flags = 0;
/* Remove device id from incore area */
/* Add new free space in disk block to free list */
return (0);
}
/*
* Check if there is a device id for a locator index.
*
* Caller of this routine should not free devid or minor_name since
* these will point to internal data structures that should not
* be freed.
*/
static int
mddb_set_t *s,
char **minor_name
)
{
return (0);
}
*minor_name =
return (1);
} else
return (0);
}
/*
* Check if device id is valid on current system.
* Needs devid, previously known dev_t and current minor_name.
*
* Success:
* Returns 0 if valid device id is found and updates
* dev_t if the dev_t associated with the device id is
* different than dev_t.
* Failure:
* Returns 1 if device id not valid on current system.
*/
static int
{
int retndevs;
int devid_flag = 0;
int cnt;
if (dev == 0)
return (1);
/*
* See if devid is valid in the current system.
* If so, set dev to match the devid.
*/
if (retndevs > 0) {
/* devid is valid to use */
devid_flag = 1;
/* does dev_t in list match dev */
cnt = 0;
break;
cnt++;
}
/*
* If a different dev_t, then setup
* new dev and new major name
*/
}
}
}
if (devid_flag)
return (0);
else
return (1);
}
/*
* Free the devid incore data areas
*/
static void
{
if (icp) {
if (icp->did_ic_blkp) {
}
if (icp->did_ic_dbp) {
while (did_dbp1) {
sizeof (mddb_did_db_t));
}
}
if (icp->did_ic_freep) {
while (did_freep1) {
sizeof (mddb_did_free_t));
}
}
}
}
static daddr_t
)
{
}
}
/*
* when a buf header is passed in the new buffer must be
* put on the front of the chain. writerec counts on it
*/
static int
mddb_set_t *s, /* incore db set structure */
int cnt, /* number of blocks to be written */
/* and put buf address here */
)
{
int err = 0;
/*
* if a header for a buf chain is passed in this is async io.
* currently only done for optimize records
*/
if (bufhead) {
return (0);
}
freebuffer(s, bfp);
if (err) {
return (MDDB_F_EWRITE);
}
return (0);
}
/*
* wrtblklst - takes an array of logical block numbers
* and writes the buffer to those blocks (scatter).
* If called during upgrade, this routine expects a
* non-translated (aka target) dev.
*/
static int
mddb_set_t *s, /* incore set structure */
const int li, /* locator index */
/* and put buf address here */
int master_only /* allow only master node to write */
)
{
int err = 0;
int cons;
/*
* If a MN diskset and only the master can write,
* then a non-master node will just return success.
*/
(master_only == MDDB_WR_ONLY_MASTER)) {
/* return successfully if we aren't the master */
return (0);
}
}
return (1);
}
cons = 1;
while (cnt) {
cons++;
continue;
}
}
/*
* If an MN diskset and any_node_can_write
* then this request is coming from writeoptrecord
* and l_flags field should not be updated.
* l_flags will be updated as a result of sending
* a class1 message to the master. Setting l_flags
* here will cause slave to be out of sync with
* master.
*
* Otherwise, set the error in l_flags
* (this occurs if this is not a MN diskset or
* only_master_can_write is set).
*/
(master_only == MDDB_WR_ONLY_MASTER)) {
}
return (err);
}
if (bufhead)
if (cnt) {
}
cons = 1;
}
return (0);
}
/*
* Takes a count of logical blocks (cnt)
* and writes the buffer to those contiguous logical blocks.
* If called during upgrade, this routine expects a non-translated
* (aka target) dev.
*/
static int
mddb_set_t *s, /* incore set structure */
int cnt, /* number of log blocks to be written */
const int li, /* locator index */
int master_only /* allow only master node to write */
)
{
int err = 0;
int i;
int size;
int ret;
/*
* If a MN diskset and only the master can write,
* then a non-master node will just return success.
*/
(master_only == MDDB_WR_ONLY_MASTER)) {
/* return successfully if we aren't the master */
return (0);
}
}
return (1);
}
if (cnt > 1) {
for (i = 0; i < cnt; i++)
li, 0, MDDB_WR_ONLY_MASTER);
return (ret);
}
return (err);
}
return (0);
}
/*
* writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
*/
static int
mddb_set_t *s, /* incore set structure */
int cnt, /* number of log blocks to be written */
int master_only /* allow only master node to write */
)
{
int li;
int err = 0;
continue;
}
return (err);
}
/*
* writelocall - write the locator block and device id information (if
* replica is in device id format) to all ACTIVE/NON-ERRORED replicas.
*
* Increments the locator block's commitcnt. Updates the device id area's
* commitcnt if the replica is in device id format. Regenerates the
* checksums after updating the commitcnt(s).
*/
static int
mddb_set_t *s /* incore set structure */
)
{
int li;
int err = 0;
s->s_lbp->lb_commitcnt++;
}
continue;
/* write out blocks containing actual device ids */
while (did_dbp) {
}
/* write out device id area block */
}
/* write out locator block */
}
/*
* If a MN diskset and this is the master, set the PARSE_LOCBLK flag
* in the mddb_set structure to show that the locator block has
* been changed.
*/
s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
}
return (err);
}
/*
* If called during upgrade, this routine expects a translated
* (aka miniroot) dev.
*/
static int
mddb_set_t *s, /* incore db set structure */
int cnt /* number of blocks to read */
)
{
int err = 0;
freebuffer(s, bfp);
if (err) {
return (MDDB_F_EREAD);
}
return (0);
}
/*
* readblklst - takes an array of logical block numbers
* and reads those blocks (gather) into the buffer.
* If called during upgrade, this routine expects a non-translated
* (aka target) dev.
*/
static int
mddb_set_t *s, /* incore set structure */
int li /* locator index */
)
{
int err = 0;
int cons;
return (1);
}
cons = 1;
while (cnt) {
cons++;
continue;
}
}
return (err);
if (cnt) {
}
cons = 1;
}
return (0);
}
/*
* Takes a count of logical blocks (cnt)
* and reads those contiguous logical blocks into the buffer.
* If called during upgrade, this routine expects a non-translated
* (aka target) dev.
*/
static int
mddb_set_t *s, /* incore set structure */
int cnt, /* number of logical blocks to be read */
int li /* locator index */
)
{
int i;
int size;
int ret;
if (cnt > 1) {
for (i = 0; i < cnt; i++)
return (ret);
}
return (1);
}
}
static void
mddb_set_t *s
)
{
while (s->s_singlelockgotten) {
s->s_singlelockwanted++;
}
s->s_singlelockgotten++;
}
static void
mddb_set_t *s
)
{
ASSERT(s->s_singlelockgotten);
s->s_singlelockgotten = 0;
if (s->s_singlelockwanted) {
s->s_singlelockwanted = 0;
}
}
static size_t
)
{
return (size);
}
static size_t
)
{
return (size);
}
static mddb_de32_t *
)
{
return (ret);
}
static void
)
{
#endif
+ sizeof (db32p->db32_firstentry)));
}
}
/*
* If called during upgrade, this routine expects a translated
* (aka miniroot) dev.
* If master blocks are found, set the mn_set parameter to 1 if the
* master block revision number is MDDB_REV_MNMB; otherwise,
* set it to 0.
* If master blocks are not found, do not change the mn_set parameter.
*/
static mddb_mb_ic_t *
mddb_set_t *s,
int *mn_set
)
{
int error = 0;
if (mddb_devopen(dev)) {
if (flag)
*flag |= MDDB_F_EMASTER;
return ((mddb_mb_ic_t *)NULL);
}
btodb(MDDB_BSIZE))) {
error |= MDDB_F_EMASTER;
}
}
/* Check for MDDB_REV_MNMB and lower */
}
}
}
}
if (error)
goto out;
/*
* Check the md_devid_destroy and md_keep_repl_state flags
* to see if we need to regen the devid or not.
*
* Don't care about devid in local set since it is not used
* and this should not be part of set importing
*/
MD_SET_IMPORT)) {
/*
* Now check the destroy flag. We also need to handle
* the case where the destroy flag is reset after the
* destroy
*/
if (md_devid_destroy) {
mb->mb_devid_len = 0;
}
/*
* Try to regenerate it if the 'keep' flag is not set
*/
if (!md_keep_repl_state) {
&devid) == DDI_SUCCESS) {
mb->mb_devid_len =
mb->mb_devid_len);
} else {
}
}
/*
* Push
*/
}
}
}
if (! error) {
/* Set mn_set parameter to 1 if a MN set */
*mn_set = 1;
else
*mn_set = 0;
return (mbi);
}
out:
/* Error Out */
if (flag)
return ((mddb_mb_ic_t *)NULL);
}
static int
mddb_set_t *s,
int li
)
{
int err = 0;
#endif
if (err) {
return (MDDB_F_EDATA | err);
}
return (MDDB_F_EFMT | MDDB_F_EDATA);
}
return (MDDB_F_EFMT | MDDB_F_EDATA);
}
/* Check crc for this record */
return (MDDB_F_EFMT | MDDB_F_EDATA);
}
return (0);
}
/*
* Code to read in the locator name information
*/
static int
mddb_set_t *s,
int li
)
{
int err = 0;
/*
* read in the locator name blocks
*/
if (err) {
err |= MDDB_F_EDATA;
goto out;
}
goto out;
}
goto out;
}
} else {
goto out;
}
}
goto out;
}
out:
/*
* if error occurred in locator name blocks free them
* and return
*/
if (err) {
return (err);
}
return (0);
}
/*
* code to read in a copy of the database.
*/
static int
mddb_set_t *s,
int li
)
{
int err = 0;
#endif
/*
* read in all the directory blocks
*/
if (! dbhp) {
} else {
}
if (err) {
err |= MDDB_F_EDATA;
break;
}
break;
}
break;
}
break;
}
/*
* first go through and fix up all de_next pointers
*/
if (dbp->db_firstentry) {
de32p = (mddb_de32_t *)
+ sizeof (db32p->db32_firstentry)));
dep = (mddb_de_ic_t *)
kmem_zalloc(sizeof (mddb_de_ic_t) -
sizeof (mddb_block_t) +
KM_SLEEP);
sizeof (mddb_de_ic_t) -
sizeof (mddb_block_t) +
sizeof (mddb_block_t) *
}
}
/*
* go through and make sure all of the pointers to record blocks
* are null.
*/
}
/*
* if error occurred in directory blocks free them
* and return
*/
if (err) {
while (dbp) {
while (dep) {
/* No mddb_rb32_t structures yet */
}
}
return (err);
}
/*
*/
err = 0;
continue;
if (err)
break;
/* Don't include CHANGELOG in big XOR */
continue;
}
if (err)
break;
}
if (checksum) {
if (! err)
}
if (err) {
while (dbp) {
while (dep) {
dep->de_recsize);
}
}
}
return (err);
}
static int
mddb_set_t *s,
int li)
{
int result;
#endif
result = 0;
continue;
result++;
}
}
return (result);
}
static void
mddb_set_t *s,
int opti
)
{
int li;
int blkonly = 0;
int mincnt;
int thiscnt;
/*
* scan through and see if data bases have to vary by only device
*/
blkonly = 1;
continue;
blkonly = 0;
break;
}
}
}
mincnt = 999999;
int removable = 0;
continue;
if (blkonly) {
continue;
} else {
continue;
}
}
/*
* Check if this is a removable device. If it is we
* assume it is something like a USB flash disk, a zip disk
* or even a floppy that is being used to help maintain
* mddb quorum. We don't want to put any optimized resync
* records on these kinds of disks since they are usually
* slower or less durable than a regular fixed disk.
*/
int error;
int propvalue = 0;
int proplength = sizeof (int);
!= NULL) {
"removable-media",
if (error == DDI_PROP_SUCCESS)
removable = 1;
}
}
if (removable)
continue;
}
}
}
static void
)
{
#endif
rbp->rb_private = 0;
}
static void
)
{
#endif
/*
* If it's a driver record, and an old style record, and not a DRL
* record, we must convert it because it was incore as a 64 bit
* structure but its on-disk layout has only 32 bits for block sizes.
*/
(type >= MDDB_FIRST_MODID) &&
case MDDB_F_STRIPE:
break;
case MDDB_F_MIRROR:
break;
case MDDB_F_RAID:
break;
case MDDB_F_SOFTPART:
break;
case MDDB_F_TRANS_MASTER:
break;
case MDDB_F_TRANS_LOG:
break;
case MDDB_F_HOTSPARE:
break;
case MDDB_F_OPT:
default:
}
} else {
}
}
static void
mddb_set_t *s,
)
{
int li;
int i;
int err = 0;
#endif
for (i = 0; i < 2; i++) {
continue;
continue;
if (err)
continue;
continue;
continue;
/* Check the crc for this record */
continue;
}
break;
}
}
rbp->rb_private = 0;
return;
}
/* Generate the crc for this record */
}
/*
* writeoptrecord writes out an optimized record.
*/
static int
mddb_set_t *s,
)
{
int li;
int i;
#endif
err = 0;
while (s->s_opthavequeuinglck) {
s->s_optwantqueuinglck++;
}
s->s_opthavequeuinglck++;
for (i = 0; i < 2; i++) {
/*
* The only possible error is xlate. This can
* occur if a replica was off line and came
* back. In the meantime the database grew
* larger than the now on-line replica can store.
continue;
/*
* In a MN diskset, any node can write optimized record(s).
*/
/*
* For MN diskset, set error in optinfo structure so
* that mddb_commitrec knows which replica failed.
*/
if ((MD_MNSET_SETNO(s->s_setno)) &&
(wrt_err & MDDB_F_EWRITE)) {
}
}
s->s_opthavequeuinglck = 0;
if (s->s_optwantqueuinglck) {
s->s_optwantqueuinglck = 0;
cv_broadcast(&s->s_optqueuing_cv);
}
/*
* If an MN diskset, don't set replica
* in error since this hasn't been set in master.
* Setting replica in error before master could
* leave the nodes with different views of the
* world since a class 1 configuration change
* could occur in mddb_commitrec as soon as
* all locks are dropped. Must keep this
* node the same as master and can't afford a
* failure from the class 1 config change
* if master succeeded.
*/
if (!(MD_MNSET_SETNO(s->s_setno))) {
} else {
/*
* Find which de_optinfo (which replica)
* had a failure and set the failure in
* the o_flags field.
*/
} else {
}
}
err |= MDDB_F_EWRITE;
}
freebuffer(s, bfp);
}
return (err);
}
/*
* Fix up the optimized resync record. Used in the traditional and local
* disksets to move an optimized record from a failed or deleted mddb
* to an active one.
*
* In a MN diskset, the fixing of the optimized record is split between
* the master and slave nodes. If the master node moves the optimized
* resync record, then the master node will send a MDDB_PARSE_OPTRECS
* message to the slave nodes causing the slave nodes to reget the
* directory entry containing the location of the optimized resync record.
* After the record is reread from disk, then writeoptrecord is called
* if the location of the optimized resync record or flags have changed.
* When writeoptrecord is called, the node that is the owner of this record
* will write the optimized record to the location specified in the directory
* entry. Since the master node uses the highest class message (PARSE)
* the record owner node is guaranteed to already have an updated
* directory entry incore.
*
* The other difference between the traditional/local set and MN diskset
* is that the directory entry can be written to disk before the optimized
* record in a MN diskset if the record is owned by a slave node. So,
* the users of an optimized record must handle the failure case when no
* data is available from an optimized record since the master node could
* have failed during the relocation of the optimized record to another mddb.
*/
static int
mddb_set_t *s,
)
{
int changed;
int writedata;
int err = 0;
int i;
int rec_owner; /* Is node owner of record? */
#endif
changed = 0;
writedata = 0;
for (i = 0; i < 2; i++) {
/*
* If optimized record has seen a replica failure,
* assign new replica to record and re-write data
* to new record.
*/
writedata++;
changed++;
/* Set flag for slaves to reread dep and write rec */
}
}
/*
* If just an error in the data was seen, set
* the optimized record's replica flag to active (ok)
* and try again.
*/
writedata++;
}
}
rec_owner = 0;
/*
* If a MN diskset then check the owner of optimized record.
* If the master node owns the record or if there is
* no owner of the record, then the master can write the
* optimized record to disk.
* Master node can write the optimized record now, but
* slave nodes write their records during handling of
* the MDDB_PARSE_OPTRECS message.
*/
rec_owner = 1;
}
} else {
/*
* In traditional diskset and local set, this node
* is always the record owner and always the master.
*/
rec_owner = 1;
}
/*
* If this node is the record owner, write out record.
*/
return (err);
}
}
if (! changed)
return (0);
1, MDDB_WR_ONLY_MASTER);
return (err);
}
static int
mddb_set_t *s
)
{
int err = 0;
/*
* In a MN diskset, the master node is the only node that runs
* fixoptrecords. If the master node changes anything, then the
* master node sends PARSE message to the slave nodes. The slave
* nodes will then re-read in the locator block or re-read in the
* directory blocks and re-write the optimized resync records.
*/
return (0);
}
continue;
if (err != 0)
return (err);
}
}
return (0);
}
/*
* Checks the incore version of the mddb data against the mddb data on disk.
*
* Returns:
* - 0 if the data was successfully read and is good.
* - MDDB_F_EREAD if a read error occurred.
* - 1 if the data read is bad (checksum failed, etc)
*/
static int
(
mddb_set_t *s,
int li
)
{
int i;
int retval = 1;
#endif
if (s->s_databuffer_size == 0) {
s->s_databuffer_size = maxrecsize;
}
/*
* first go through and make sure all directory stuff
* is the same
*/
goto err;
}
goto err;
goto err;
goto err;
goto err;
goto err;
if (cdb32p->db32_firstentry) {
cde32p = (mddb_de32_t *)
+ sizeof (cdb32p->db32_firstentry)));
} else
/*
* check if all directory entries are identical
*/
goto err;
goto err;
goto err;
goto err;
goto err;
for (i = 0; i < 2; i++) {
break;
}
if (i != 2)
goto err;
goto err;
else
}
goto err;
}
/*
* If here, all directories are functionally identical
* check to make sure all records are identical
* the reason the records are not just bcmped is that the
* lock flag does not want to be compared.
*/
continue;
goto err;
}
/* Check the crc for this record */
goto err;
goto err;
}
}
return (0);
err:
return (retval);
}
/*
* Determine if the location information for two mddbs is the same.
* The device slice and block offset should match. If both have devids then
* use that for the comparison, otherwise we compare the dev_ts.
* Comparing with the devid allows us to handle the case where a mddb was
* relocated to a dead mddb's dev_t. The live mddb will have the dev_t of
* the dead mddb but the devid comparison will catch this and not match.
*
* Return 1 if the location of the two mddbs match, 0 if not.
*/
static int
{
/*
* If this element is errored then we don't try to match on it.
* If we try to match we could erroneously match on the dev_t
* of a relocated disk.
*/
return (0);
}
return (0);
} else {
return (0);
}
return (0);
return (1);
}
static int
int flag)
{
int sz;
if (MD_UPGRADE) {
} else {
return (EINVAL);
}
/*
* Get dev associated with device id and minor name.
* Setup correct driver name if dev is now different.
* Don't change driver name if during upgrade.
*/
}
} else {
/* Mark as invalid */
}
}
if (dev_2b_fixed)
r = *rip;
while (r) {
r->ri_flags |= MDDB_F_EMASTER;
} else {
}
return (0); /* already entered return success */
}
r = r->ri_next;
}
/*
* This replica not represented in the current rip list,
* so add it to the list.
*/
}
}
KM_SLEEP);
(char *)r->ri_old_devid, sz);
} else {
r->ri_old_devid = 0;
}
/*
* Devid is present, but not valid. This could
* happen if device has been powered off or if
* the device has been removed. Mark the device in
* error. Don't allow any writes to this device
* based on the dev_t since another device could
* have been placed in its spot and be responding to
* the dev_t accesses.
*/
r->ri_flags |= MDDB_F_EMASTER;
}
} else {
r->ri_devid = 0;
r->ri_old_devid = 0;
}
/*
* If the rip list is empty then this entry
* is the list.
*/
*rip = r;
return (0);
}
/*
* Add this entry to the end of the rip list
*/
return (0);
}
/*
* writecopy writes the incore data blocks out to all of the replicas.
* This is called from writestart
* - when a diskset is started or
* - when an error has been encountered during the write to a mddb.
* and from newdev when a new mddb is being added.
*
* flag can be 2 values:
* MDDB_WRITECOPY_ALL - write all records to all mddbs. This is
* always used for traditional and local disksets.
* For MN diskset:
* All nodes can call writecopy, but only the
* master node actually writes data to the disk
* except for optimized resync records.
* An optimized resync record can only be written to
* by the record owner.
* MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new
* master has been chosen, the new master may need to
* write its incore mddb to disk (this is the case where the
* old master had executed a message but hadn't relayed it
* to this slave yet). New master should not write the
* change log records since new master would be overwriting
* valuable data. Only used during a reconfig cycle.
*/
static int
mddb_set_t *s,
int li,
int flag
)
{
int err = 0;
#endif
if (err)
return (err);
/*
* In a multinode diskset, when a new master is
* chosen the new master may need to write its
* incore copy of the mddb to disk. In this case,
* don't want to overwrite the change log records
* so new master sets flag to MDDB_WRITECOPY_SYNC.
*/
if (flag == MDDB_WRITECOPY_SYNC) {
continue;
}
/*
* In a multinode diskset, don't write out optimized
* resync records since only the mirror owner node
* will have the correct data. If writecopy is
* being called from writestart as a result of
* an mddb failure, then writestart will handle
* the optimized records when it calls fixoptrecords.
*/
if ((MD_MNSET_SETNO(s->s_setno)) &&
continue;
}
/* Generate the crc for this record */
return (err);
}
}
return (0);
}
static int
mddb_set_t *s,
char *tag
)
{
int medok;
int li;
int alc;
int lc;
/* If no mediator hosts, nothing to do */
return (0);
/*
* If this is a MN set and we are not the master, then don't
* update mediator hosts or mark mediator as golden since
* only master node should do that.
*/
return (0);
}
meddb.med_dat_fl = 0;
/* count accessible mediators */
/* count accessible and existing replicas */
continue;
lc++;
continue;
alc++;
}
/*
* Mediator update quorum is >= 50%: check for less than
* "mediator update" quorum.
*/
/* panic if <= 50% of all replicas are accessible */
"md: Update of 50%% of the mediator hosts failed");
/* NOTREACHED */
}
"md: Update of 50%% of the mediator hosts failed");
}
/*
* If we have mediator update quorum and exactly 50% of the replicas
* are accessible then mark the mediator as golden.
*/
}
return (0);
}
/*
* push_lb - push the change to all the replicas of set s.
*
* MD_MNSET_SETNO(s->s_setno) selects between the MN diskset path and the
* path taken for every other kind of set.
*
* Returns the value returned by writelocall(s).
*/
static int
push_lb(mddb_set_t *s)
{
/* push the change to all the replicas */
if (MD_MNSET_SETNO(s->s_setno)) {
} else {
}
return (writelocall(s));
}
/* Should not call for MN diskset since data tags are not supported */
static int
{
int diff = 0;
if (diff)
return (diff);
if (diff)
return (diff);
if (diff)
return (diff);
/*CSTYLED*/
}
/* Should not call for MN diskset since data tags are not supported */
static int
{
int nextid = 0;
/* Run to the end of the list */
return (0);
nextid++;
}
/* Add the new member */
/* Update the dtag portion of the list */
sizeof (mddb_dtag_t));
/* Fix up the id value */
return (0);
}
/*
* dtl_cntl - return a count (ndt) for set s.
*
* Even though data tags are not supported in MN disksets, dtl_cntl may
* be called for a MN diskset since this routine is called even before
* the kind of diskset being read in from disk is known.
* For a MN diskset, s_dtlp is 0 so a count of 0 is returned.
*/
static int
dtl_cntl(mddb_set_t *s)
{
/* Running count; stays 0 when nothing is counted */
int ndt = 0;
ndt++;
}
return (ndt);
}
/*
* Even though data tags are not supported in MN disksets, this routine may
* be called for a MN diskset since it is called even before the kind of
* diskset being read in from disk is known.
* For a MN diskset, s_dtlp is 0 so a NULL pointer is returned.
*/
static mddb_dtag_t *
{
}
return ((mddb_dtag_t *)NULL);
}
/* Should not call for MN diskset since data tags are not supported */
static void
{
}
}
/*
* Even though data tags are not supported in MN disksets, dt_setup will
* be called for a MN diskset since this routine is called even before
* it is known the kind of diskset being read in from disk.
* Once this set is known as a MN diskset, the dtp area will be freed.
*/
static void
{
/* shorthand */
/* Initialize the setno */
/* Clear the id and flags, this is only used in user land */
/* Checksum it */
}
/* Should not call for MN diskset since data tags are not supported */
static int
{
if (lbp->lb_dtblkcnt == 0) {
/* Data tags not used in a MN set - so no failure returned */
return (0);
"No tag record allocated, unable to tag data");
return (1);
}
/* Clear the stack variable */
/* Get the HW serial number for this host */
/* Get the nodename that this host goes by */
/* Get a time stamp for NOW */
/* Setup the data tag record */
/* Free any list of tags if they exist */
/* Put the new tag onto the tag list */
return (0);
}
/*
* If called during upgrade, this routine expects a non-translated
* (aka target) dev.
* Should not call for MN diskset since data tags are not supported.
*/
static int
{
int err = 0;
/* If have not allocated a data tag record, there is nothing to do */
if (lbp->lb_dtblkcnt == 0)
return (1);
return (1);
/* shorthand */
return (1);
}
/* error reading the tag */
if (err) {
err = 1;
goto out;
}
tbuf += MDDB_BSIZE;
}
/* magic is valid? */
err = 1;
goto out;
}
/* revision is valid? */
err = 1;
goto out;
}
/* crc is valid? */
err = 1;
goto out;
}
/* shorthand */
/* set number match? */
err = 1;
goto out;
}
/* tag is not empty? */
err = 2;
goto out;
}
/* Mark the locator as having tagged data */
out:
if (err) {
if (err == 1) {
}
}
}
return (err);
}
/*
* dt_write - write the tag to the locators of set s and reset the
* appropriate flags.
*
* Should not call for MN diskset since data tags are not supported.
*
* Returns err (initialized to 0). The routine returns early, before any
* write is attempted, when lb_dtblkcnt is 0 (nowhere to write to) or when
* MD_SET_BADTAG is set in set_status.
*/
static int
dt_write(mddb_set_t *s)
{
int li;
int err = 0;
int werr;
int empty_tag = 0;
/* Nowhere to write to */
if (lbp->lb_dtblkcnt == 0)
return (err);
/* MD_SET_BADTAG is set for this set; leave without writing */
if (set_status & MD_SET_BADTAG)
return (err);
/* shorthand */
/* See if the tag is empty. */
empty_tag = 1;
/* Write the tag to the locators and reset appropriate flags. */
continue;
if (werr) {
continue;
}
if (empty_tag)
else {
}
}
if (err)
return (err);
/* If the tags were written, check to see if any tags remain. */
continue;
break;
}
/* If there are no tags, then clear CLRTAG and TAGDATA */
}
return (err);
}
/* Should not call for MN diskset since data tags are not supported */
static int
{
int i;
int li;
int moveit = 0;
/*
* If the data tag record is allocated (blkcnt != 0) and a bad tag was
* not detected, there is nothing to do.
*/
return (0);
/* Bitmap not setup, checks can't be done */
if (s->s_totalblkcnt == 0)
return (0);
/* While reading the tag(s) an invalid tag data record was seen */
if (set_status & MD_SET_BADTAG)
/* See if the invalid tag needs to be moved */
for (i = 0; i < MDDB_DT_BLOCKS; i++)
moveit = 1;
break;
}
/* Need to move or allocate the tag data record */
if (lbp->lb_dtfirstblk == 0) {
"Unable to allocate data tag record");
return (0);
}
/* Mark the locators so that they get written to disk. */
continue;
}
return (1);
}
/*
* Make sure the blocks are owned, since the calculation in
* computefreeblks() is bypassed when MD_SET_BADTAG is set.
*/
for (i = 0; i < MDDB_DT_BLOCKS; i++)
return (1);
}
/*
* Writestart writes the incore mddb out to all of the replicas.
* This is called when a diskset is started and when an error has
* been encountered during the write to a mddb.
*
* flag can be 2 values:
* MDDB_WRITECOPY_ALL - write all records to all mddbs. This is
* always used for traditional and local disksets.
* This is the normal path for MN disksets since the slave
* nodes aren't actually allowed to write to disk.
* MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new
* master has been chosen, the new master may need to
* write its incore mddb to disk (this is the case where the
* old master had executed a message but hadn't relayed it
* to this slave yet). New master should not write the
* change log records since new master would be overwriting
* valuable data. Only used during a reconfig cycle.
*/
static int
mddb_set_t *s,
int flag
)
{
int li;
int err = 0;
continue;
continue;
return (1);
}
continue;
continue;
return (1);
}
/*
* Call fixoptrecord even during a reconfig cycle since a replica
* failure may force the master to re-assign the optimized
* resync record to another replica.
*/
if (fixoptrecords(s))
return (1);
/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
continue;
break;
if ((set_status & MD_SET_TAGDATA) ||
(set_status & MD_SET_CLRTAG))
break;
}
/*
* If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
* the lbp identifier and the set identifier doesn't match.
*/
/* Only call for traditional and local sets */
(void) dt_write(s);
return (err);
(void) upd_med(s, "writestart(0)");
return (err);
(void) upd_med(s, "writestart(1)");
else
lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
* names have changed.
* Don't set parseflags as a result of a new master sync
* during reconfig cycle since slaves nodes are already
* in-sync with the new master.
*/
(flag != MDDB_WRITECOPY_SYNC)) {
s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
}
if (err)
return (err);
}
continue;
} else {
}
}
return (0);
}
/*
* selectreplicas selects the working replicas and may write the incore
* version of the mddb out to the replicas ondisk.
*
* flag can be 3 values:
* MDDB_RETRYSCAN - quick scan to see if there is an error.
* If no new error, returns without writing mddb
* to disks. If a new error is seen, writes out
* mddb to disks.
* MDDB_SCANALL - lengthy scan to check out mddbs and always writes
* out mddb to the replica ondisk. Calls writecopy
* with MDDB_WRITECOPY_ALL flag which writes out
* all records to the replicas ondisk.
* MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
* and ondisk mddbs by writing incore values to disk.
* Calls writecopy with MDDB_WRITECOPY_SYNC flag so
* that change log records are not written out.
* Only used by MN disksets.
*
* Returns:
* 0 - Successful
* 1 - Unable to write incore mddb data to disk since < 50% replicas.
*/
int
mddb_set_t *s,
int flag
)
{
int li;
int alc;
int lc;
int wc_flag;
/*
* can never transition from stale to not stale
*/
continue;
} else {
}
}
return (1);
}
continue;
} else {
}
} else {
}
}
computefreeblks(s); /* set up free block bits */
} else {
continue;
break;
}
/*
* If there are no new errors, this error has already
* been processed; return the current state.
*/
do {
}
alc = 0;
lc = 0;
continue;
lc++;
continue;
alc++;
}
return (1);
}
/* Set wc_flag based on flag passed in. */
if (flag == MDDB_SCANALLSYNC)
else
do {
if (! writestart(s, wc_flag)) {
return (0);
}
alc = 0;
continue;
continue;
}
alc++;
}
return (1);
}
static int
mddb_set_t *s,
int probe
)
{
int error;
return (0);
if (probe == MDDB_NOPROBE)
return (1);
if (error == 0)
s->s_zombie = 0;
}
return (error);
}
static int
mddb_set_t *s
)
{
if (selectreplicas(s, MDDB_RETRYSCAN))
if (selectreplicas(s, MDDB_SCANALL))
return (1);
return (0);
}
static void
{
}
}
/*
* save_rip - for each entry visited by the trip loop, add a new member at
* the end of a list and clear the fields that are not needed for hints
* (ri_commitcnt and ri_transplant).
*
* Returns nrip.
* NOTE(review): presumably nrip is the head of the newly built list and
* trip walks the rip list of set s - confirm against the full source.
*/
static mddb_ri_t *
save_rip(mddb_set_t *s)
{
while (trip) {
/* Run to the end of the list */
/* void */;
/* Add the new member */
/* shorthand */
/* Clear the stuff that is not needed for hints */
rip->ri_commitcnt = 0;
rip->ri_transplant = 0;
}
return (nrip);
}
static void
{
}
}
}
}
/*
* This routine selects the correct replica to use.
* The rules are as follows:
* 1. if all replicas have the same init time, select the highest commit count
* 2. if some but not all replicas are from another hostid, discard
* them.
* 3. find which init time is present in most replicas
* 4. discard all replicas which do not match the most common init time
* 5. select the replica with the highest commit count
*/
static mddb_lb_t *
mddb_set_t *s
)
{
int different;
int same;
int count;
int maxcount;
int mn_set = 0;
/* Clear the ri_transplant flag on all the rip entries. */
/* Set ri_commitcnt to locator's commitcnt - if available */
r->ri_transplant = 0;
/* If any locators have MN bit set, set flag */
mn_set = 1;
}
}
/*
* A data tag is being used, so use it to limit the selection first.
* Data tags not used in MN diskset.
*/
/*
* now toss any locators that have a different data tag
*/
continue;
/* If same tag, keep it */
continue;
}
}
if (!(md_get_setstatus(setno) &
}
}
r->ri_transplant = 1;
}
/* Tag used, clear the bit */
/*
* Get rid of the list of tags.
*/
/*
* Re-create the list with the tag used.
*/
}
}
/*
* scan to see if all replicas have same time
*/
continue;
continue;
}
/* CSTYLED */
break;
}
/*
* if r == NULL then they were all the same. Choose highest
* commit count
*/
goto out;
/*
* If here, a bogus replica is present and at least 1 lb_inittime
* did not match.
*/
/*
* look and see if any but not all are from different id
*/
different = 0;
same = 0;
continue;
different = 1;
else
same = 1;
}
/*
* now go through and throw out different if there are some
* that are the same
*/
continue;
continue;
}
if (!(md_get_setstatus(setno) &
}
}
r->ri_transplant = 1;
}
}
/*
* go through and pick highest. Use n square because it is
* simple and 40 some is max possible
*/
maxcount = 0;
continue;
count = 0;
continue;
&r->ri_lbp->lb_inittime, ==))
count++;
}
}
}
/*
* now go through and toss any that are of a different time stamp
*/
continue;
&r->ri_lbp->lb_inittime, ==))
continue;
}
}
}
r->ri_transplant = 1;
}
out:
/*
* Find the locator with the highest commit count, and make it the
* "chosen" one.
*/
continue;
continue;
}
}
/* Toss all locator blocks, except the "chosen" one. */
continue;
/* Get rid of all dtp's */
}
continue;
/* Get rid of extra locator devid block info */
}
}
/* Get rid of extra locators */
}
return (lbp);
}
static void
int li,
)
{
int mn_set = 0;
mn_set = 1;
for (i = 0; i < MD_MNMAXSIDES; i++) {
break;
}
if (i == MD_MNMAXSIDES)
return;
} else {
}
/* copy device id from mddb to cfg_loc structure */
for (i = 0; i < sz; i++) {
}
} else {
}
}
}
}
/*
* Even if a devid exists, use the dev, drvnm and mnum in the locators
* and sidelocators. During startup, the dev, drvnm and mnum in
* these structures may not match the devid (the locators and
* sidelocators will be updated to match the devid by the routine
* load_old_replicas). Using out-of-sync values won't cause any
* problems since ridev will re-derive these from the devid and mnum.
* After startup, the dev, drvnm and mnum in these structures have
* been updated and can be used.
*/
if (mn_set) {
} else {
}
}
/*
* Find the index into the mnsidelocator where entry will go.
* Then index can be fed into both splitname2locatorblocks and
* cfgloc2locator so that those entries can be kept in sync.
*
* Returns:
* -1 if failed to find unused slot or if a traditional diskset
* index, if successful (0 <= index < MD_MNMAXSIDES)
*/
static int
int li,
)
{
uchar_t i;
int index = -1;
/*
* Checking side locator structure. First, check if
* there is already an entry for this side. If so,
* then use that entry. Otherwise, find an entry
* that has a sideno of 0.
*/
for (i = 0; i < MD_MNMAXSIDES; i++) {
/* Found a match - stop looking */
index = i;
break;
/* Set first empty slot, but keep looking */
index = i;
}
}
/* Didn't find empty slot or previously used slot */
return (-1);
}
return (index);
} else
return (0);
}
/*
* Takes locator information (driver name, minor number, sideno) and
* stores it in the locator block.
* For traditional diskset, the sideno is the index into the sidelocator
* array in the locator block.
* For the MN diskset, the sideno is the nodeid which can be any number,
* so the index passed in is the index into the mnsidelocator array
* in the locator block.
*/
static int
int li,
int index /* Only useful in MNsets when > 1 */
)
{
uchar_t i;
mddb_set_t *s;
int mn_set = 0;
mn_set = 1;
/*
* Index will be the slot that has the given sideno or
* the first empty slot if no match is found.
* This was pre-checked out in check locator.
*/
} else {
}
/*
* Look for the driver name
*/
for (i = 0; i < MDDB_DRVNMCNT; i++) {
continue;
MD_MAXDRVNM) == 0)
break;
}
/*
* Didn't find one, add a new one
*/
if (i == MDDB_DRVNMCNT) {
for (i = 0; i < MDDB_DRVNMCNT; i++) {
break;
}
if (i == MDDB_DRVNMCNT)
return (1);
}
/* Fill in the drvnm index */
if (mn_set) {
mnslp->mnl_drvnm_index = i;
} else {
slp->l_drvnm_index = i;
}
/*
* This device id could already be associated with this index
* if this is not the first side added to the set.
* If device id is 0, there is no device id for this device.
*/
return (0);
clp->l_minor_name)) {
return (1);
}
}
return (0);
}
/*
* See if there are mediator hosts and try to use the data.
*/
static int
mddb_set_t *s
)
{
int medok = 0;
int medacc = 0;
int golden = 0;
int err = 1;
/* Do not have a mediator, then the state is stale */
return (err);
/* Contact the mediator hosts for the data */
/* No mediator data, stale */
return (err);
/* Mark all the mediator data that is not for this set as errored */
/* Count the number of mediators contacted */
medacc++;
/* Paranoid check */
/*CSTYLED*/
}
/* Get the max commitcount */
maxcc = 0;
continue;
}
/* Now mark the records that don't have the highest cc as errored */
continue;
}
/* Now mark the records that don't match the lb commitcnt as errored */
continue;
}
/* Is there a "golden" copy and how many valid mediators */
continue;
golden++;
medok++;
}
/* No survivors, stale */
if (medok == 0)
goto out;
/* No mediator quorum and no golden copies, stale */
/* Skip odd numbers, no exact 50% */
goto out;
/* Have 50%, allow an accept */
goto out;
}
/* We either have a quorum or a golden copy, or both */
err = 0;
out:
if (meddlp) {
}
}
return (err);
}
/*
* 1. read masterblks and locator blocks for all known database locations
* a. keep track of which have good master blks
* b. keep track of which have good locators
*
*/
static int
mddb_set_t *s,
int *write_lb
)
{
/* May be cast to mddb_mnlb_t */
/* if accessing sidenames in */
/* MN set */
mddb_did_blk_t *did_blkp = 0;
int did_blkp_sz = 0;
int li;
int retval = 0;
int err = 0;
int lb_ok = 0;
int lb_total = 0;
int lb_tagged = 0;
int lb_tags;
int cont_flag, i;
int mn_set = 0;
/*
* read in master blocks and locator block for all known locators.
* lb_blkcnt will be set correctly for MN set later once getmasters
* has determined that the set is a MN set.
*/
/*
* Translated dev is only used in calls to getmasters and
* getblks which expect a translated (aka miniroot) dev.
*/
/* Set error flag that getmasters would have set */
/* if getmasters had been allowed to fail */
}
/*
* Invalid device id on system (due to failed or
* removed device) or invalid devt during upgrade
* (due to powered off device) will cause this
* replica to be marked in error and not used.
*/
continue;
/* get all master blocks, does mddb_devopen() */
/* if invalid master block - try next replica */
continue;
/*
* If lbp alloc'd to wrong size - reset it.
* If MN set, lb_blkcnt must be MDDB_MNLBCNT.
* If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
*/
if (lbp) {
}
}
/* If a MN set, set lb_blkcnt for MN loc blk size */
if (mn_set)
KM_SLEEP);
}
/*
* Read in all the sectors for the locator block
* NOTE: Need to use getblks, rather than readblklst.
* because it is too early and things are
* NOT set up yet for read*()'s
*/
btodb(MDDB_BSIZE));
if (err) {
break;
}
buffer += MDDB_BSIZE;
}
if (err)
continue;
/* Verify the locator block */
continue;
continue;
continue;
if (mn_set) {
/* If a MN set, check for MNLB revision in lb. */
continue;
} else {
/* If not a MN set, check for LB revision in lb. */
continue;
}
continue;
/*
* With the addition of MultiNode Disksets, we must make sure
* to verify that this is the correct set. A node could
* have been out of the config for awhile and this disk could
* have been moved to a different diskset and we don't want
* to accidentally start the wrong set.
*
* We don't do this check if we're in the middle of
* importing a set.
*/
continue;
/*
* a commit count of zero means this locator has been deleted
*/
if (lbp->lb_commitcnt == 0)
continue;
/*
* If replica is in the device ID style and md_devid_destroy
* flag is set, turn off device id style. This is only to be
* used in a catastrophic failure case. Examples would be
* where the device id of all drives in the system
* (especially the mirror'd root drives) had been changed
* by firmware upgrade or by a patch to an existing disk
* driver. Another example would be in the case of non-unique
* device ids due to a bug. The device id would be valid on
* the system, but would return the wrong dev_t.
*/
lbp->lb_didfirstblk = 0;
lbp->lb_didblkcnt = 0;
*write_lb = 1;
}
/*
* If replica is in device ID style, read in device ID
* block and verify device ID block information.
*/
/* Read in device ID block */
did_icp = (mddb_did_ic_t *)
kmem_zalloc(sizeof (mddb_did_ic_t),
KM_SLEEP);
} else {
/* Reuse did_icp, but clear out data */
if (did_icp->did_ic_blkp !=
(mddb_did_blk_t *)NULL) {
(mddb_did_blk_t *)NULL;
}
if (did_icp->did_ic_dbp !=
(mddb_did_db_t *)NULL) {
while (did_dbp1) {
sizeof (mddb_did_db_t));
}
(mddb_did_db_t *)NULL;
}
for (i = 0; i < MDDB_NLB; i++) {
did_icp->did_ic_devid[i] =
}
}
/* Can't reuse blkp since size could be different */
}
KM_SLEEP);
blk++) {
btodb(MDDB_BSIZE));
if (err) {
break;
}
buffer += MDDB_BSIZE;
}
if (err)
continue;
/* Verify the Device ID block */
continue;
continue;
continue;
continue;
continue;
/*
* Check if device ID block is out of sync with the
* Locator Block by checking if the locator block
* commitcnt does not match the device id block
* commitcnt. If an 'out of sync' condition
* exists, discard this replica since it has
* inconsistent data and can't be used in
* determining the best replica.
*
* An 'out of sync' condition could happen if old
* SDS code was running with new devid style replicas
* or if a failure occurred between the writing of
* the locator block's commitcnt and the device
* id block's commitcnt.
*
* If old SDS code had been running, the upgrade
* process should detect this situation and
* have removed all of the device id information
* via the md_devid_destroy flag in md.conf.
*/
if (did_blkp->blk_commitcnt !=
lbp->lb_commitcnt) {
continue;
}
}
/*
* If replica is still in device ID style, read in all
* of the device IDs, verify the checksum of the device IDs.
*/
/*
* Reset valid bit in device id info block flags. This
* flag is stored on disk, but the valid bit is reset
* when reading in the replica. If the corresponding
* device id is valid (aka meaning that the system
* knows about this device id), the valid bit will
* be set at a later time. The valid bit for this
* replica's device ID will be set in this routine.
* The valid bits for the rest of the device id's
* will be set after the 'best' replica has
* been selected in routine load_old_replicas.
* Reset updated bit in device id info block flags.
* This flag is also stored on disk, reset when read
* in and set when the locators and side locators
* have been updated to match this valid device
* id information.
*/
did_info->info_flags &=
}
cont_flag = 0;
/* Check if block has already been read in */
while (did_dbp != 0) {
if (did_dbp->db_firstblk ==
break;
else
}
/* if block not found, read it in */
btodb(MDDB_BSIZE));
if (err) {
break;
}
buffer += MDDB_BSIZE;
}
if (err) {
cont_flag = 1;
break;
}
/*
* Block read in - alloc Disk Block area
*/
sizeof (mddb_did_db_t), KM_SLEEP);
/* Add to front of dbp list */
}
/* Check validity of devid in block */
cont_flag = 1;
break;
}
/* Block now pointed to by did_dbp */
}
}
if (cont_flag)
continue;
}
/*
* All blocks containing devids are now in core.
*/
/*
* If we're doing a replicated import (also known as
* remote copy import), the device id in the locator
* block is incorrect and we need to fix it up here
* along with the l_dev, otherwise we run into lots of
* trouble later on.
*/
continue;
continue;
continue;
continue;
if (ddi_devid_compare(
continue;
}
/* update l_dev */
}
}
}
/*
* If there is a valid devid, verify that this locator
* block has information about itself by checking the
* device ID, minor_name and block
* number from this replica's incore data structure
* against the locator block information that has just
* been read in from disk.
*
* If not a valid devid, verify that this locator block
* has information about itself by checking the minor
* number, block number and driver name from this
* replica's incore data structure against the locator
* block information that has just been read in from disk.
*/
/*
* This locator block MUST have locator (replica)
* information about itself. Check against devid,
* slice part of minor number, and block number.
*/
continue;
continue;
if ((md_get_setstatus(setno) &
continue;
} else {
continue;
}
did_info->info_minor_name) != 0)
continue;
break;
}
} else {
/*
* This locator block MUST have locator (replica)
* information about itself.
*/
if (!mn_set) {
continue;
continue;
continue;
MD_MAXDRVNM) == 0)
break;
}
} else {
int i;
/*
* Check all possible locators looking for a
* match to the currently read-in locator,
* must match on:
* - blkno
* - side locator for this node's side
* - side locator minor number
* - side locator driver name
*/
/* Looking at sidelocs - cast lbp -> mnlbp */
continue;
continue;
for (i = 0; i < MD_MNMAXSIDES; i++) {
break;
}
}
/* No matching side found */
if (i == MD_MNMAXSIDES)
continue;
continue;
MD_MAXDRVNM) == 0)
break;
}
}
}
/*
* Didn't find ourself in this locator block it means
* the locator block is a stale transplant. Probably from
* a user doing a dd.
*/
continue;
/*
* Keep track of the number of accessed and valid
* locator blocks.
*/
lb_ok++;
/*
* Read the tag in, skips invalid or blank tags.
* Only valid tags allocate storage
* Data tags are not used in MN disksets.
*/
/*
* Keep track of the number of tagged
* locator blocks.
*/
lb_tagged++;
/* Keep a list of unique tags. */
}
/*
* go through locator block and add any other
* locations of the data base.
* For the replicated import case, this was done earlier
* and we really don't need or want to do so again
*/
continue;
cl->l_devid_sz = 0;
cl->l_old_devid_sz = 0;
did_icp);
== NULL) {
continue;
} else {
cl->l_devid_flags =
}
}
did_icp);
}
}
/* Save LB for later */
} else
}
}
while (did_dbp1) {
sizeof (mddb_did_db_t));
}
}
}
}
/* No locator blocks were ok */
if (lb_ok == 0)
goto out;
/* No tagged data was found - will be 0 for MN diskset */
if (lb_tagged == 0)
goto out;
/* Find the highest non-deleted replica count */
int lb_tot = 0;
continue;
continue;
continue;
lb_tot++;
}
}
/* Count the number of unique tags */
lb_tags++;
/* Should have at least one tag at this point */
/*
* If the number of tagged locators is not the same as the number of
* OK locators OR more than one tag exists, then make sure the
* selected tag will be written out later.
*/
/* Only a single tag, take the tagged data */
if (lb_tags == 1) {
goto out;
}
/* Multiple tags, not selecting a tag, tag mode is on */
out:
return (retval);
}
/*
* 1. Select a locator.
* 2. check if enough locators now have current copies
* 3. read in database from one of latest
* 4. if known to have latest make all database the same
* 5. if configuration has changed rewrite locators
*
* Parameters:
* s - pointer to mddb_set structure
* flag - used in MN disksets to tell if this node is being joined to
* a diskset that is in the STALE state. If the flag is
* MDDB_MN_STALE, then this node should be marked in the STALE
* state even if > 50% mddbs are available. (The diskset can
* only change from STALE->OK if all nodes withdraw from the
* MN diskset and then rejoin).
*/
static int
mddb_set_t *s,
int flag
)
{
int li;
int alc;
int lc;
int tlc;
int retval = 0;
caddr_t p;
mddb_sidelocator_t *slp = 0;
mddb_mnsidelocator_t *mnslp = 0;
uchar_t i;
char *name;
char *minor_name;
int write_lb = 0;
/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
goto errout;
goto errout;
}
/* If a multi-node set, then set md_set.s_status flag */
/*
* If data tag area had been allocated before set type was
* known - free it now.
*/
}
}
/*
* If the replica is in devid format, setup the devid incore ptr.
*/
break;
}
}
/*
* If no devid incore info found - something has gone
* wrong so errout.
*/
goto errout;
}
/*
* Add all blocks containing devids to free list.
* Then remove addresses that actually contain devids.
*/
while (did_dbp1) {
goto errout;
}
}
continue;
/* unable to find disk block */
goto errout;
}
}
}
/*
* create mddb_mbaray, count all locators and active locators.
*/
alc = 0;
lc = 0;
continue;
/* Count non-deleted replicas */
lc++;
/*
* Use the devid of this locator to compare with the rip
* list. The scenario to watch out for here is that this
* locator could be on a disk that is dead and there could
* be a valid entry in the rip list for a different disk
* that has been moved to the dead disks dev_t. We don't
* want to match with the moved disk.
*/
break;
}
}
/*
* If rip not found, then mark error in master block
* so that no writes are later attempted to this
* replica. rip may not be setup if ridev
* failed due to un-found driver name.
*/
continue;
}
if (rip->ri_transplant)
alc++;
}
/* Save on a divide - calculate 50% + 1 up front */
} else { /* alc == tlc && even - ? */
/* Can do an accept, and are */
} else { /* possibly has a mediator */
if (mediate(s)) {
} else {
}
}
/*
* The mirrored_root_flag allows the sysadmin to decide to
* when there are only 50% available mddbs on the system and
* when the root file system is on a mirror. This is useful
* in a 2 disk system where 1 disk failure would cause an mddb
* quorum failure and subsequent boot failures since the root
* filesystem would be in a read-only state.
*/
svm_bootpath[0] != 0) {
} else {
/* Allow half mode - CAREFUL! */
if (mddb_allow_half)
}
}
/*
* In a MN diskset,
* - if 50% mddbs are unavailable and this
* has been marked STALE above
* - master node isn't in the STALE state
* - this node isn't the master node (this node
* isn't the first node to join the set)
* then clear the STALE state and set TOOFEW.
*
* If this node is the master node and set was marked STALE,
* then the set stays STALE.
*
* If this node is not the master and this node's state is
* STALE and the master node is not marked STALE,
* then master node must be in the TOOFEW state or the
* master is panic'ing. A MN diskset can only be placed into
* the STALE state by having the first node join the set
* with <= 50% mddbs. There's no way for a MN diskset to
* transition between STALE and not-STALE states unless all
* nodes are withdrawn from the diskset or all nodes in the
* diskset are rebooted at the same time.
*
* So, mark this node's state as TOOFEW instead of STALE.
*/
== (MD_SET_MNSET | MD_SET_STALE)) &&
((flag & MDDB_MN_STALE) == 0) &&
}
}
/*
* If a MN set is marked STALE on the other nodes,
* mark it stale here. Override all other considerations
* such as a mediator or > 50% mddbs available.
*/
if (flag & MDDB_MN_STALE)
}
/*
* read a good copy of the locator names
* if an error occurs reading what is supposed
* to be a good copy continue looking for another
* good copy
*/
continue;
/* Find rip entry for this locator if one exists */
break;
}
continue;
}
continue;
}
continue;
}
/*
* Now have a copy of the database that is equivalent
* to the chosen locator block with respect to
* inittime, identifier and commitcnt. Trying the
* equivalent databases in the order that they were
* written will provide the most up to date data.
*/
if (s->s_lnp)
break;
}
goto errout;
}
/*
* read a good copy of the data base
* if an error occurs reading what is supposed
* to be a good copy continue looking for another
* good copy
*/
continue;
/* Find rip entry for this locator if one exists */
break;
}
continue;
}
continue;
}
continue;
}
/*
* Now have a copy of the database that is equivalent
* to the chosen locator block with respect to
* inittime, identifier and commitcnt. Trying the
* equivalent databases in the order that they were
* written will provide the most up to date data.
*/
if (s->s_dbp)
break;
}
goto errout;
}
/*
* go through and find largest record;
* Also fixup the user data area's
*/
getoptrecord(s, dep);
else {
}
if (maxrecsize > s->s_databuffer_size) {
if (s->s_databuffer_size)
s->s_databuffer = p;
s->s_databuffer_size = maxrecsize;
}
/* If we can clear the tag data record, do it now. */
/* Data tags not supported on MN sets */
/* This will return non-zero if STALE or TOOFEW */
/* This will write out chosen replica image to all replicas */
if (selectreplicas(s, MDDB_SCANALL))
goto errout;
if (rip->ri_old_devid == 0)
continue;
devidptr) != 0) {
continue;
}
if (update_locatorblock(s,
goto errout;
}
}
}
}
}
/*
* If the replica is in device id style - validate the device id's,
* if present, in the locator block devid area.
*/
continue;
/* Validate device id on current system */
if (mddb_devid_validate(
did_info->info_minor_name) == 0) {
/* Set valid flag */
} else {
}
} else if (!(MD_UPGRADE)) {
/*
* If a device doesn't have a device id,
* check if there is now a device ID
* associated with device. If one exists,
* add it to the locator block devid area.
* If there's not enough space to add it,
* print a warning.
* Don't do this during upgrade.
*/
DDI_SUCCESS) {
== DDI_SUCCESS) {
if (mddb_devid_add(s, li,
ret_devid, minor_name)) {
"Not enough space in"
" metadevice state"
" database\n");
"to add relocation"
" information for"
" device:\n");
" major = %d, "
" minor = %d\n",
} else {
write_lb = 1;
}
}
}
}
}
/*
* If a device has a valid device id and if the dev_t
* associated with the device id has changed, update the
* driver name, minor num and dev_t in the local and side
* locators to match the dev_t that the system currently
* associates with the device id.
*
* Don't do this during upgrade.
*/
if (!(MD_UPGRADE)) {
continue;
int j;
int index = -1;
for (j = 0; j < MD_MNMAXSIDES; j++) {
lb_mnsidelocators[j][li];
if (mnslp->mnl_sideno ==
s->s_sideno)
break;
if (mnslp->mnl_sideno == 0)
index = j;
}
if (j == MD_MNMAXSIDES) {
/* No match found; take empty */
write_lb = 1;
write_lb = 1;
}
} else {
write_lb = 1;
}
}
i = mnslp->mnl_drvnm_index;
} else {
i = slp->l_drvnm_index;
}
/* Driver name has changed */
/* Look for the driver name */
for (i = 0; i < MDDB_DRVNMCNT; i++) {
!= len)
continue;
if (strncmp(
break;
}
/* Didn't find one, add it */
if (i == MDDB_DRVNMCNT) {
for (i = 0; i < MDDB_DRVNMCNT;
i++) {
== 0)
break;
}
if (i == MDDB_DRVNMCNT) {
"Unable to update driver"
" name for dev: "
"major = %d, "
"minor = %d\n",
continue;
}
(void) strncpy(
name, MD_MAXDRVNM);
}
/* Fill in the drvnm index */
mnslp->mnl_drvnm_index = i;
} else {
slp->l_drvnm_index = i;
}
write_lb = 1;
}
}
}
}
}
/*
* If locator block has been changed by get_mbs_n_lbs,
* by addition of new device id, by updated minor name or
* by updated driver name - write out locator block.
*/
if (write_lb) {
if (push_lb(s))
goto errout;
}
/*
* If the tag was moved, allocated, or a BADTAG was seen for some other
* reason, then make sure tags are written to all the replicas.
* Data tags not supported on MN sets.
*/
if (! (lc = dt_alloc_if_needed(s))) {
continue;
lc = 1;
break;
}
}
}
if (lc) {
(void) selectreplicas(s, MDDB_SCANALL);
}
}
/* Free extraneous rip components. */
/* Get rid of lbp's and dtp's */
}
}
}
}
}
continue;
break;
}
continue;
}
/*
* Get rid of mbp's:
* if lbp, those out of lb_loccnt bounds
* if !lbp, all of them.
*/
}
}
/*
* Turn off MDDB_F_EMASTER flag in a diskset since diskset
* code always ends up calling ridev for all replicas
* before calling load_old_replicas. ridev will reset
* MDDB_F_EMASTER flag if flag was due to unresolved devid.
*/
if (setno != MD_LOCAL_SET)
}
return (retval);
}
/*
* Given the devt from the md.conf info, get the devid for the device.
*
* NOTE(review): the declarator, the conditions and the calls of this
* routine are not present in this view; only a local declaration and
* three bare early returns remain. Confirm the bail-out conditions
* against the complete source before relying on this summary.
*/
static void
{
char *minor;
return;
}
return;
}
return;
}
}
/*
* grab driver name, minor, block and devid out of
* strings like "driver:minor:block:devid"
*
* Returns -1 as soon as one of the ':' separators is not found where
* expected, or when the devid string check fails; returns 0 otherwise.
* The results are stored through clp (l_devid_flags, l_devid_sz and
* l_old_devid_sz are the fields visibly written here).
*
* NOTE(review): several lines of this routine (the declarator, the loop
* heads that copy each field, and parts of two conditions) are not
* present in this view.
*/
static int
char *str,
)
{
char *p, *e;
char *minor_name;
*p++ = *str++;
*p = '\0';
/* Each field must be followed by a ':' separator. */
if (*str++ != ':')
return (-1);
}
if (*str++ != ':')
return (-1);
}
if (*str++ != ':')
return (-1);
/*
* If the md_devid_destroy flag is set, ignore the device ids.
* This is only to be used in a catastrophic failure case. Examples
* would be where the device id of all drives in the system
* (especially the mirror'd root drives) had been changed
* by firmware upgrade or by a patch to an existing disk
* driver. Another example would be in the case of non-unique
* device ids due to a bug. The device id would be valid on
* the system, but would return the wrong dev_t.
*/
if (md_devid_destroy) {
clp->l_devid_flags = 0;
clp->l_devid_sz = 0;
clp->l_old_devid_sz = 0;
return (0);
}
if (ddi_devid_str_decode(str,
return (-1);
clp->l_devid_flags = 0;
clp->l_old_devid_sz = 0;
/* If no device id associated with device, just return */
clp->l_devid_sz = 0;
md_keep_repl_state == 0) {
/*
* No devid in md.conf; we're in recovery mode so
* lookup the devid for the device as specified by
* the devt in md.conf.
*/
}
return (0);
}
return (0);
}
/*
* grab driver name, minor, and block out of
* strings like "driver:minor:block:devid driver:minor:block:devid ..."
*
* The string is walked one whitespace-delimited entry at a time. Each
* entry is temporarily NUL-terminated in place, handed to parse_db_loc,
* and then the overwritten character is put back, so str is unchanged
* on return.
*
* NOTE(review): the declarator and parts of the per-entry handling are
* not present in this view.
*/
static void
char *str
)
{
char *p, *e;
char restore_space;
/* CSTYLED */
for (p = str; (*p != '\0'); ) {
/* Skip leading whitespace; stop if nothing but whitespace is left. */
for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
;
if (*p == '\0')
break;
/* Find the end of this entry (next whitespace or end of string). */
for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
;
/*
* Only give parse_db_loc 1 entry, so stuff a null into
* the string if we're not at the end. We need to save this
* char and restore it after call.
*/
restore_space = '\0';
if (*e != '\0') {
restore_space = *e;
*e = '\0';
}
if (parse_db_loc(p, cl) != 0) {
} else {
(void) ridev(
cl->l_devid_sz);
}
}
/* Undo the temporary termination before moving to the next entry. */
if (restore_space != '\0') {
*e = restore_space;
}
p = e;
}
}
/*
* grab database locations supplied by md.conf as properties
*
* NOTE(review): the loop head, the property lookup call and the calls
* that parse and free each property are not present in this view; the
* description in the body comment below is the authoritative outline.
*/
static void
parse_db_strings(void)
{
int bootlist_id;
int proplen;
/*
* size of _bootlist_name should match uses of line and entry in
* libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
*/
char *bootlist_name;
/*
* Step through the bootlist properties one at a time by forming the
* correct name, fetching the property, parsing the property and
* then freeing the memory. If a property does not exist or returns
* some form of error just ignore it. There is no guarantee that
* the properties will always exist in sequence, for example
* mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
* mddb_bootlist3 existing.
*/
bootlist_name = &_bootlist_name[0];
proplen = 0;
&proplen) != DDI_PROP_SUCCESS)
continue;
/* A property with no payload is skipped as well. */
if (proplen <= 0)
continue;
if (md_init_debug)
}
}
/*
* Initialise the in-core mddb state for a set (presumably initit; the
* declarator is not present in this view).
*
* Visible return values:
*	MDDB_E_NOTNOW	an early check (condition not visible) failed
*	0		s->s_lbp was already set, the old replicas loaded
*			cleanly, or a new database was set up
*	retval		load failed with MDDB_E_TAGDATA, or load failed
*			and mddb_db_err_detail is set
*	MDDB_E_NODB	any other load failure
*
* NOTE(review): many statements are missing from this view, so the
* branch structure below cannot be read literally.
*/
static int
int flag
)
{
int i;
mddb_set_t *s;
int retval = 0;
int devid_flag;
return (MDDB_E_NOTNOW);
}
/*
* init is already underway, block. Return success.
*/
if (s->s_lbp) {
return (0);
}
uniqtime32(&s->s_inittime);
if (setno == MD_LOCAL_SET)
s->s_zombie = 0;
/* Reset the stale-delete count and the optimized-commit bookkeeping. */
s->s_staledeletes = 0;
s->s_optcmtcnt = 0;
s->s_opthavelck = 0;
s->s_optwantlck = 0;
s->s_optwaiterr = 0;
s->s_opthungerr = 0;
/*
* KEEPTAG can never be set for a MN diskset since no tags are
* allowed to be stored in a MN diskset. No way to check
* if this is a MN diskset or not at this point since the mddb
* hasn't been read in from disk yet. (flag will only have
* MULTINODE bit set if a new set is being created.)
*/
for (i = 0; i < mddb_maxbufheaders; i++) {
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
freebuffer(s, bfp);
}
/* If 0 return value - success */
if (! retval) {
return (0);
}
/*
* If here, then the load_old_replicas() failed
*/
/* If the database was supposed to exist. */
if (flag & MDDB_MUSTEXIST) {
for (i = 0; i < mddb_maxcopies; i++) {
if (! s->s_mbiarray[i])
continue;
dev = md_expldev(
free_mbipp(&s->s_mbiarray[i]);
}
}
sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
s->s_mbiarray = NULL;
}
}
}
if (retval == MDDB_E_TAGDATA)
return (retval);
/* Want a bit more detailed error messages */
if (mddb_db_err_detail)
return (retval);
return (MDDB_E_NODB);
}
/*
* MDDB_NOOLDOK set - Creating a new database, so do
* more initialization.
*/
if (flag & MDDB_MULTINODE) {
}
if (flag & MDDB_MULTINODE) {
} else {
}
if (flag & MDDB_MULTINODE) {
for (i = 0; i < MDDB_NLB; i++) {
mnslp->mnl_sideno = 0;
mnslp->mnl_drvnm_index = 0;
}
}
} else {
for (i = 0; i < MDDB_NLB; i++) {
}
}
}
/* lb starts on block 0 */
/* locator names starts after locator block */
if (flag & MDDB_MULTINODE) {
} else {
}
if (flag & MDDB_MULTINODE) {
/* Creating a multinode diskset */
}
/* Data portion of mddb located after locator names */
/* the btodb that follows is converting the directory block size */
/* Data tag part of mddb located after first block of mddb data */
btodb(MDDB_BSIZE));
/* Data tags are not used in MN diskset - so set count to 0 */
if (flag & MDDB_MULTINODE)
else
if (flag & MDDB_MULTINODE) {
} else {
}
/*
* Set up Device ID portion of Locator Block.
* Do not set locator to device id style if
* md_devid_destroy is 1 and md_keep_repl_state is 1
* (destroy all device id data and keep replica in
* non device id mode).
*
* This is logically equivalent to set locator to
* device id style if md_devid_destroy is 0 or
* md_keep_repl_state is 0.
*
* In SunCluster environment, device id mode is disabled
* which means diskset will be run in non-devid mode. For
* localset, the behavior will remain intact and run in
* device id mode.
*
* In multinode diskset devids are turned off.
*/
/* Start in devid mode; each check below can only turn it off. */
devid_flag = 1;
if (setno != MD_LOCAL_SET)
devid_flag = 0;
if (flag & MDDB_MULTINODE)
devid_flag = 0;
devid_flag = 0;
/*
* if we weren't devid style before and md_keep_repl_state=1
* we need to stay non-devid
*/
(md_keep_repl_state == 1))
devid_flag = 0;
if (devid_flag) {
(sizeof (mddb_did_ic_t), KM_SLEEP);
did_blkp = (mddb_did_blk_t *)
}
dbp->db_nextblk = 0;
return (0);
}
/*
* Enter a set and return its mddb_set_t (presumably mddb_setenter; the
* declarator is not present in this view).
*
* Returns NULL on failure. On the one failure path where the store is
* visible, the error code is written through errorcodep, and only when
* errorcodep is non-NULL. Returns the set pointer directly when the
* locator block is already loaded (s->s_lbp) or when the caller passed
* MDDB_NOINIT.
*
* NOTE(review): the lookup, locking and initialisation calls are not
* present in this view.
*/
int flag,
int *errorcodep
)
{
mddb_set_t *s;
int err = 0;
if (errorcodep != NULL)
return (NULL);
}
/* Allocate s_un and s_ui arrays if not already present. */
if (errorcodep != NULL)
return (NULL);
}
}
if (errorcodep != NULL)
return (NULL);
}
}
/* Already initialised, or the caller asked for no initialisation. */
if (s->s_lbp)
return (s);
if (flag & MDDB_NOINIT)
return (s);
/*
* Release the set mutex - it will be acquired and released in
* initit after acquiring the mddb_lock. This is done to assure
* that mutexes are always acquired in the same order to prevent
* possible deadlock
*/
if (errorcodep != NULL)
*errorcodep = err;
return (NULL);
}
}
/*
* Release the set lock for a given set.
*
* In a MN diskset, this routine may send messages to the rpc.mdcommd
* in order to have the slave nodes re-parse parts of the mddb.
* Messages are only sent if the global ioctl lock is not held.
*
* With the introduction of multi-threaded ioctls, there is no way
* to determine which thread(s) are holding the ioctl lock. So, if
* the ioctl lock is held (by process X) process X will send the
* messages to the slave nodes when process X releases the ioctl lock.
*
* NOTE(review): the declarator and the mutex/allocation/send calls are
* not present in this view.
*/
void
mddb_set_t *s
)
{
int i;
/* Non-zero so the send loop below runs at least once. */
int rval = 1;
/*
* If not a MN diskset OR
* a MN diskset but this node isn't master,
* then release the mutex.
*/
if (!(MD_MNSET_SETNO(s->s_setno)) ||
((MD_MNSET_SETNO(s->s_setno)) &&
return;
}
/*
* If global ioctl lock is held, then send no messages,
* just release mutex and return.
*
*/
if (md_status & MD_GBL_IOCTL_LOCK) {
return;
}
/*
* This thread is not holding the ioctl lock, so drop the set
* lock, send messages to slave nodes to reparse portions
* of the mddb and return.
*
* If the block parse flag is set, do not send parse messages.
* This flag is set when master is adding a new mddb that would
* cause parse messages to be sent to the slaves, but the slaves
* don't have knowledge of the new mddb yet since the mddb add
* operation hasn't been run on the slave nodes yet. When the
* master unblocks the parse flag, the parse messages will be
* generated.
*
* If s_mn_parseflags_sending is non-zero, then another thread
* is already currently sending a parse message, so just release
* the mutex and return. If an mddb change occurred that results
* in a parse message to be generated, the thread that is currently
* sending a parse message would generate the additional parse message.
*
* If s_mn_parseflags_sending is zero and parsing is not blocked,
* then loop until s_mn_parseflags is 0 (until there are no more
* messages to send).
* While s_mn_parseflags is non-zero,
* put snapshot of parse_flags in s_mn_parseflags_sending
* set s_mn_parseflags to zero
* release mutex
* send message
* re-grab mutex
* set s_mn_parseflags_sending to zero
*/
KM_SLEEP);
while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
(s->s_mn_parseflags & MDDB_PARSE_MASK) &&
/* Grab snapshot of parse flags */
s->s_mn_parseflags_sending = s->s_mn_parseflags;
s->s_mn_parseflags = 0;
/*
* Send the message to the slaves to re-parse
* the indicated portions of the mddb. Send the status
* of the 50 mddbs in this set so that slaves know which
* mddbs that the master node thinks are 'good'.
* Otherwise, slave may reparse, but from wrong replica.
*/
for (i = 0; i < MDDB_NLB; i++) {
mddb_parse_msg->msg_lb_flags[i] =
}
/* Keep retrying the send until it reports success (rval == 0). */
while (rval != 0) {
/*
* NOTE(review): mddb_parse_msg is dereferenced with -> above, so
* sizeof (mddb_parse_msg) below looks like the size of a pointer
* rather than of the message structure. Its declaration is not
* visible here - confirm the intended length.
*/
(char *)mddb_parse_msg,
sizeof (mddb_parse_msg), kresult);
if (rval != 0)
"mddb update message to other nodes in "
"diskset %s\n", s->s_setname);
}
/*
* Re-grab mutex to clear sending field and to
* see if another parse message needs to be generated.
*/
s->s_mn_parseflags_sending = 0;
}
}
/*
* NOTE(review): the function name and its entire body are not present
* in this view; only the return type, the single mddb_set_t parameter
* and the braces remain. Presumably a variant of the set-exit routine
* above - confirm against the complete source.
*/
static void
mddb_set_t *s
)
{
}
/*
* Work through the replicas for conversion to device id style; when
* doit is set the device id block is actually allocated and filled in
* (see the comments in the body).
*
* Returns 1 on the visible failure paths (including a locator block
* whose lb_didfirstblk is still 0 after allocation, and a non-zero
* retval at the end when doit is set); returns 0 otherwise.
*
* NOTE(review): the declarator, the loop head and most calls are not
* present in this view.
*/
{
char *minor_name;
int retval;
int err;
/* Need disk block(s) to hold mddb_did_blk_t */
if (doit) {
/*
* Alloc mddb_did_blk_t disk block and fill in header area.
* Don't fill in did magic number until end of routine so
* if machine panics in the middle of conversion, the
* device id information will be thrown away at the
* next snarfing of this set.
* Need to set DEVID_STYLE so that mddb_devid_add will
* function properly.
*/
/* grab the mutex */
return (1);
}
if (lbp->lb_didfirstblk == 0) {
mddb_setexit(s);
return (1);
}
KM_SLEEP);
KM_SLEEP);
}
/* Fill in information in mddb_did_info_t array */
continue;
/*
* No translation available for replica.
* Could fail conversion to device id replica,
* but instead will just continue with next
* replica in list.
*/
continue;
}
/*
* Just count each devid as at least 1 block. This
* is conservative since several device id's may fit
* into 1 disk block, but it's better to overestimate
* the number of blocks needed than to underestimate.
*/
if (doit) {
&minor_name) == DDI_SUCCESS) {
minor_name)) {
"Not enough space in metadb"
" to add device id for"
" dev: major = %d, "
"minor = %d\n",
}
}
}
}
}
if (doit) {
mddb_setexit(s);
if (retval != 0)
return (1);
}
return (0);
}
/*
* Obtain the mddb_set_t for a set number. Returns NULL when setno is
* not below MD_MAXSETS. The local set and other sets take different
* branches; the non-local branch involves a KM_SLEEP allocation.
*
* NOTE(review): the declarator, the allocation calls and the final
* return are not present in this view.
*/
static mddb_set_t *
int flag,
int *errp
)
{
mddb_set_t *s;
}
if (setno >= MD_MAXSETS)
return ((mddb_set_t *)NULL);
if (setno == MD_LOCAL_SET) {
} else {
KM_SLEEP);
}
/* have a config struct, copy mediator information */
}
/*
* Tear down the in-core mddb state of one set: master block info
* entries, locator names, locator block, free bitmap and data buffer
* are released where present, and finally the mddb_set_t itself is
* freed.
*
* NOTE(review): the declarator and most of the free calls are not
* present in this view; presumably this is mddb_unload_set, which
* mddb_unload calls for every set.
*/
void
)
{
mddb_set_t *s;
int i;
return;
s->s_opthavequeuinglck = 0;
s->s_optwantqueuinglck = 0;
if (dep->de_icreqsize)
dep->de_icreqsize);
else
dep->de_reqsize);
}
}
}
/* Release every populated master block info slot. */
for (i = 0; i < mddb_maxcopies; i++) {
if (! s->s_mbiarray)
break;
if (! s->s_mbiarray[i])
continue;
free_mbipp(&s->s_mbiarray[i]);
}
}
if (s->s_mbiarray) {
sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
}
if (s->s_lnp) {
}
if (s->s_lbp) {
}
if (s->s_freebitmap) {
s->s_freebitmap = NULL;
s->s_freebitmapsize = 0;
}
if (s->s_databuffer_size) {
s->s_databuffer_size = 0;
}
/* Data tags not supported on MN sets. */
ASSERT(s->s_singlelockwanted == 0);
kmem_free(s, sizeof (mddb_set_t));
/* Take care of things setup in the md_set array */
}
}
}
/*
* returns 0 if name can be put into locator block
* returns 1 if locator block prefixes are all used
*
* Takes splitname (suffix, prefix, sideno) and
* stores it in the locator name structure.
* For traditional diskset, the sideno is the index into the suffixes
* array in the locator name structure.
* For the MN diskset, the sideno is the nodeid which can be any number,
* so the index passed in is the index into the mnsuffixes array
* in the locator structure. This index was computed by the
* routine checklocator which basically checked the locator block
* mnside locator structure.
*
* NOTE(review): the declarator and the comparison/copy statements are
* not present in this view.
*/
static int
int li,
int index
)
{
uchar_t i;
/* First pass: look for an existing matching prefix slot. */
for (i = 0; i < MDDB_PREFIXCNT; i++) {
continue;
break;
}
/* No match: second pass looks for a slot to take; fail if none. */
if (i == MDDB_PREFIXCNT) {
for (i = 0; i < MDDB_PREFIXCNT; i++) {
break;
}
if (i == MDDB_PREFIXCNT)
return (1);
}
/* If a MN diskset, use index */
} else {
sn->suf_prefix = i;
}
return (0);
}
/*
* Find the locator name for the given sideno and convert the locator name
* information into a splitname structure.
*
* Returns without producing a result when no entry is found among the
* MD_MNMAXSIDES slots that are searched.
*
* NOTE(review): the declarator, the match test inside the loop and the
* conversion statements are not present in this view.
*/
void
int li,
)
{
int iprefix;
int i;
for (i = 0; i < MD_MNMAXSIDES; i++) {
break;
}
if (i == MD_MNMAXSIDES)
return;
} else {
}
}
/*
* Look up a replica entry and, when command is MDDB_DELDEV, delete it.
* Any command other than MDDB_DELDEV returns 0 once the lookup part is
* done. The delete path zeroes the locator block commit count, clears
* the s_mbiarray slot, removes the replica's device id, rewrites the
* locator to all devices and recomputes the free block count.
*
* NOTE(review): the declarator and a large part of the body (lookup
* conditions, error returns) are not present in this view; presumably
* this is getdeldev, going by the "getdeldev(0)" tag passed to upd_med.
*/
static int
int command,
)
{
mddb_set_t *s;
int err = 0;
int i, j;
int li;
int flags = MDDB_MUSTEXIST;
/*
* Data checking
*/
}
flags |= MDDB_MN_STALE;
/* shorthand */
if (set_status & MD_SET_STALE)
if (set_status & MD_SET_TOOFEW)
/*
* go through and count active entries
*/
for (i = 0; i < loccnt; i++) {
continue;
}
/*
* add the ability to accept a locator block index
* which is not relative to previously deleted replicas. This
* is for support of MD_DEBUG=STAT in metastat since it asks for
* replica information specifically for each of the mirror resync
* records. MDDB_CONFIG_SUBCMD uses one of the pad spares in
* the mddb_config_t type.
*/
mddb_setexit(s);
setno));
}
} else {
mddb_setexit(s);
setno));
}
/* CSTYLED */
continue;
j++;
break;
}
}
if (command == MDDB_ENDDEV) {
blk = 0;
}
} else {
}
}
/* Everything below this point is the delete path only. */
if (command != MDDB_DELDEV) {
mddb_setexit(s);
return (0);
}
if (MD_UPGRADE) {
"Deletion of replica not allowed during upgrade.\n");
mddb_setexit(s);
}
/*
* If here, replica delete in progress.
*/
lbp->lb_commitcnt = 0;
/*
* Don't need to write out device id area, since locator
* block on this replica is being deleted by setting the
* commitcnt to 0.
*/
}
if (s->s_mbiarray[li])
}
s->s_mbiarray[li] = 0;
/* Only support data tags for traditional and local sets */
setno != MD_LOCAL_SET)
mdclrerror(ep);
/* Write data tags to all accessible devices */
/* Only support data tags for traditional and local sets */
(void) dt_write(s);
}
/* Delete device id of deleted replica */
(void) mddb_devid_delete(s, li);
}
/* write new locator to all devices */
err = writelocall(s);
(void) upd_med(s, "getdeldev(0)");
computefreeblks(s); /* recompute always it may be larger */
err |= fixoptrecords(s);
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
}
}
mddb_setexit(s);
return (0);
}
/*
* Returns EINVAL when either of two argument checks fails, and 0 on
* the normal path; behaviour differs under MD_UPGRADE.
*
* NOTE(review): the declarator, both check conditions and the bodies of
* the MD_UPGRADE branches are not present in this view.
*/
static int
)
{
/*
* Data checking
*/
return (EINVAL);
return (EINVAL);
if (MD_UPGRADE)
else
return (0);
}
/*
* update_valid_replica - updates the locator block namespace (prefix
*			and suffix) for the replica at index li.
* RETURN
* 1 Error
* 0 Success
*
* NOTE(review): the declarator and the string comparison/copy
* statements are not present in this view.
*/
static int
mddb_set_t *s,
int li,
char *devname,
char *pathname,
)
{
uchar_t i;
return (0);
}
return (1);
/*
* Future note: Need to do something here for the MN diskset case
* when device ids are supported in disksets.
* Can't add until merging devids_in_diskset code into code base
* Currently only called with side of 0.
*/
/*
* If new prefix is the same as the previous prefix - no change.
*
* If new prefix is not the same, check if new prefix
* matches an existing one. If so, use that one.
*
* If new prefix doesn't exist, add a new prefix. If not enough
* space, return failure.
*/
/* Check if new prefix is the same as the old prefix. */
pre_len) != 0)) {
/* Check if new prefix is an already known prefix. */
for (i = 0; i < MDDB_PREFIXCNT; i++) {
continue;
}
pre_len) == 0) {
break;
}
}
/* If no match found for new prefix - add the new prefix */
if (i == MDDB_PREFIXCNT) {
for (i = 0; i < MDDB_PREFIXCNT; i++) {
break;
}
/* No space to add new prefix - return failure */
if (i == MDDB_PREFIXCNT) {
return (1);
}
}
/* i is now the prefix slot this replica's suffix entry refers to. */
sn->suf_prefix = i;
}
/* Now, update the suffix (Ex: c0t0d0s0) if needed */
}
return (0);
}
/*
* md_update_locator_namespace - If in devid style and active and the devid's
* exist and are valid update the locator namespace pathname
* and devname.
* RETURN
* 1 Error
* 0 Success
*
* NOTE(review): the declarator, the loop over locator entries and the
* calls that perform the update and write-out are not present in this
* view.
*/
int
char *dname,
char *pname,
)
{
mddb_set_t *s;
int li;
int err = 0;
return (1);
/* must be DEVID_STYLE */
continue;
}
/* replica also must be active */
/* only update if did exists and is valid */
if ((flg & MDDB_DID_EXISTS) &&
(flg & MDDB_DID_VALID)) {
err = 1;
goto out;
}
}
}
}
}
else
lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
* names have changed.
*/
s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
}
/* Common exit: leave the set, then map err to the 0/1 result. */
out:
mddb_setexit(s);
if (err)
return (1);
return (0);
}
/*
* update_locatorblock - for active entries in the locator block, check
* the devt to see if it matches the given devt. If so, and
* there is an associated device id which is not the same
* as the passed in devid, delete old devid and add a new one.
* RETURN
* MDDB_E_NODEVID
* MDDB_E_NOLOCBLK
* 1 Error
* 0 Success
*
* NOTE(review): the declarator, the loop head, the devid comparison and
* the err_out label referenced below are not present in this view.
*/
static int
{
int li;
int retval = 0;
char *minor_name;
/* find replicas that haven't been deleted */
continue;
}
/*
* check to see if locator devt matches given dev
* and if there is a device ID associated with it
*/
(flg & MDDB_DID_EXISTS)) {
if (flg & MDDB_DID_VALID) {
continue; /* cont to nxt active entry */
}
return (MDDB_E_NODEVID);
}
/*
* devid's not equal so
* delete and add
*/
(void) mddb_devid_delete(s, li);
break;
} else {
retval = 1;
goto err_out;
}
}
}
} /* end for */
return (retval);
}
/*
* Update the device id held in a master block. mb2free records that a
* master block was allocated locally (KM_SLEEP) and therefore has to be
* released before returning. The result is err, returned through the
* common out label.
*
* NOTE(review): the declarator, the conditions and the read/write calls
* are not present in this view; presumably this is update_mb_devid.
*/
static int
mddb_set_t *s,
)
{
int mb2free = 0;
int err = 0;
/*
* There is a case where a disk may not have an mddb,
* and only has a dummy mddb which contains
* a valid devid we would like to update. In this
* case, the rip_lbp will be NULL but we would still
* like to update the devid embedded in the
* dummy mb block.
*
*/
} else {
/*
* Done if it is non-replicated set
*/
KM_SLEEP);
mb2free = 1;
} else {
goto out;
}
}
/*
*/
/*
* Zero out what we have previously
*/
if (mb->mb_devid_len)
}
/*
* putblks will
*
* - drop the s_dbmx lock
* - biowait
* - regain the s_dbmx lock
*
* Need to update this if we want to handle
* mb_next != NULL, which is unlikely to happen
*/
if (mb2free) {
}
out:
return (err);
}
/*
* Visible results: EINVAL from the data integrity check, 0 or -1 from
* early exits before the main loop, and err (set to -1 on the visible
* failure paths) through the out label, which also leaves the set.
*
* NOTE(review): the declarator, every condition and the update calls
* are not present in this view; presumably this is the handler for the
* MDDB_SETDID command seen in the mddb_config switch.
*/
static int
)
{
mddb_set_t *s;
int err = 0;
/*
* Data integrity check
*/
return (EINVAL);
return (0);
return (-1);
}
return (-1);
}
return (-1);
continue;
/*
* We only update what is asked
*/
err = -1;
goto out;
}
}
}
err = -1;
goto out;
}
out:
mddb_setexit(s);
return (err);
}
/*
* Add (MDDB_NEWSIDE) or delete (MDDB_DELSIDE) side information for a
* replica. Not permitted during upgrade. After the change the locator
* names and then the locator block are written to all devices and the
* free block count is recomputed; the normal path returns 0.
*
* NOTE(review): the declarator, the replica lookup conditions and the
* error returns are not present in this view; presumably this is
* delnewside.
*/
static int
int command,
)
{
mddb_set_t *s;
int li;
int err = 0;
char *minor_name;
if (MD_UPGRADE) {
"Addition and deletion of sides not allowed"
" during upgrade. \n");
}
/*
* Data integrity check
*/
mddb_setexit(s);
}
/*
*/
== DDI_SUCCESS)) {
use_devid = 1;
}
}
}
continue;
/* Match by device id when one is available, otherwise by device. */
if (use_devid) {
continue;
if ((ddi_devid_compare(devid,
break;
}
} else {
break;
}
}
}
if (use_devid)
mddb_setexit(s);
}
if (command == MDDB_NEWSIDE) {
int index = 0;
/*
* If a MN diskset, need to find the index where the new
* locator information is to be stored in the mnsidelocator
* field of the locator block so that the locator name can
* be stored at the same array index in the mnsuffixes
* field of the locator names structure.
*/
if (use_devid) {
}
mddb_setexit(s);
}
}
/*
* Store the locator name before the sidelocator information
* in case a panic occurs between these 2 steps. Must have
* the locator name information in order to print reasonable
* error information.
*/
if (use_devid)
mddb_setexit(s);
setno));
}
if (use_devid)
mddb_setexit(s);
setno));
}
}
if (use_devid)
if (command == MDDB_DELSIDE) {
int i;
int j;
for (j = 0; j < MD_MNMAXSIDES; j++) {
break;
}
/* A slot was found: clear its side number. */
if (j < MD_MNMAXSIDES) {
mnslp->mnl_sideno = 0;
sizeof (md_mnname_suffix_t));
}
} else {
}
}
}
/* write new locator names to all devices */
else
lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
* names have changed.
*/
s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
}
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
}
}
/* write new locator to all devices */
err = writelocall(s);
computefreeblks(s); /* recompute always it may be larger */
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
}
}
mddb_setexit(s);
return (0);
}
/*
* Add a new replica to a set. Not permitted during upgrade. The visible
* outline is: check that the device is not already a replica, get the
* master blocks, verify the replica is large enough, pick a deleted or
* new locator slot, store the locator name and side locator, then write
* the database copy, locator names, data tags and locator block out.
* The normal path returns 0.
*
* NOTE(review): the declarator, most conditions and the error returns
* are not present in this view; presumably this is newdev, going by the
* "newdev(0)" tag passed to upd_med.
*/
static int
int command,
)
{
mddb_set_t *s;
int i, j;
int li;
int err = 0;
char *minor_name;
int old_flags;
int flags;
int mn_set = 0;
int index;
/* Currently don't allow addition of new replica during upgrade */
if (MD_UPGRADE) {
"Addition of new replica not allowed during upgrade.\n");
}
/*
* Data integrity check
*/
/* Determine the flag settings for multinode sets */
if (cp->c_multi_node)
flags |= MDDB_MULTINODE;
if (err != MDDB_E_NOTOWNER)
if (s == NULL)
}
/* shorthand */
/* shorthand */
mddb_setexit(s);
}
/*
*/
use_devid = 1;
}
}
}
continue;
if (use_devid) {
continue;
if ((ddi_devid_compare(devid2,
if (command == MDDB_NEWDEV) {
mddb_setexit(s);
return (mdmddberror(ep,
}
}
} else {
if (command == MDDB_NEWDEV) {
mddb_setexit(s);
return (mdmddberror(ep,
}
}
}
}
/*
* Really is a new replica, go get the master blocks
*/
if (! mbip) {
if (use_devid)
mddb_setexit(s);
}
/*
* Compute free blocks in replica.
*/
computefreeblks(s);
/*
* Check if this is large enough
*/
for (j = i; j < s->s_totalblkcnt; j++) {
if (blkcheck(s, j)) {
while (mbip) {
}
if (use_devid)
mddb_setexit(s);
setno));
}
}
/* Look for a deleted slot */
break;
}
/* If no deleted slots, add a new one */
/* Already have the max replicas, bail */
if (use_devid)
mddb_setexit(s);
setno));
}
}
/* Initialize the new or deleted slot */
/* shorthand */
index = 0;
/*
* If a MN diskset, need to find the index where the new
* locator information is to be stored in the mnsidelocator
* field of the locator block so that the locator name can
* be stored at the same array index in the mnsuffixes
* field of the locator names structure.
*/
if (use_devid)
l_devid);
mddb_setexit(s);
}
}
/*
* Store the locator name before the sidelocator information
* in case a panic occurs between these 2 steps. Must have
* the locator name information in order to print reasonable
* error information.
*/
if (use_devid)
mddb_setexit(s);
}
/*
* Compute free blocks in replica before calling cfgloc2locator
* since cfgloc2locator may attempt to alloc an unused block
* to store the device id.
* mbiarray needs to be setup before calling computefreeblks.
*/
computefreeblks(s);
if (use_devid)
s->s_mbiarray[li] = 0;
mddb_setexit(s);
}
if (use_devid)
/* write db copy to new device */
/* write new locator names to all devices */
else
lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
* names have changed.
*/
s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
}
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
}
}
/* Data tags not supported on MN sets */
setno != MD_LOCAL_SET)
mdclrerror(ep);
/* Write data tags to all accessible devices */
/* Data tags not supported on MN sets */
(void) dt_write(s);
}
/* write new locator to all devices */
err = writelocall(s);
(void) upd_med(s, "newdev(0)");
computefreeblks(s); /* recompute always it may be smaller */
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
}
}
mddb_setexit(s);
return (0);
}
#ifdef DEBUG
/*
* DEBUG-only per-set consistency check; it looks at de_rb_userdata of
* directory entries.
*
* NOTE(review): the declarator, the loops and the actual checks are not
* present in this view; presumably this is mddb_check_set, which
* mddb_check calls for each set.
*/
static void
)
{
mddb_set_t *s;
return;
if (dep->de_rb_userdata)
}
}
}
#endif /* DEBUG */
/*
* Exported Entry Points
*/
#ifdef DEBUG
/*
* DEBUG-only: run mddb_check_set over the sets, indexed 0..md_nsets-1.
*
* NOTE(review): the condition guarding the return inside the loop is
* not present in this view; as shown the loop would return on its first
* iteration. Confirm the guard against the complete source.
*/
void
mddb_check(void)
{
int i;
for (i = 0; i < md_nsets; i++) {
return;
mddb_check_set(i);
}
}
#endif /* DEBUG */
/*
* Dispatch an mddb configuration command. The error state in ep is
* cleared first, then command selects the handler; err is returned.
*
* NOTE(review): the declarator and every handler call inside the switch
* are not present in this view, so only the case labels can be read
* here. Presumably this is mddb_config.
*/
int
)
{
mddb_set_t *s;
int flag = 0;
int err = 0;
mdclrerror(ep);
switch (command) {
case MDDB_NEWDEV:
break;
case MDDB_NEWSIDE:
case MDDB_DELSIDE:
break;
case MDDB_GETDEV:
case MDDB_DELDEV:
case MDDB_ENDDEV:
break;
case MDDB_GETDRVRNAME:
break;
case MDDB_USEDEV:
/*
* Note: must allow USEDEV ioctl during upgrade to support
* auto-take disksets.
*
* Also during the set import if the md_devid_destroy
* flag is set then error out
*/
break;
}
}
if (setno == MD_LOCAL_SET)
flag = MDDB_F_IOCTL;
}
mddb_setexit(s);
break;
case MDDB_RELEASESET:
break;
case MDDB_SETDID:
break;
default:
}
return (err);
}
/*
* Visible results: EINVAL from an argument check, otherwise 0 (both
* from inside the search loop and after it), leaving the set before
* each of the later returns.
*
* NOTE(review): the declarator, the conditions and the search loop body
* are not present in this view; the function's purpose cannot be
* established from what remains.
*/
int
)
{
mddb_set_t *s;
return (EINVAL);
return (0);
continue;
mddb_setexit(s);
return (0);
}
}
mddb_setexit(s);
return (0);
}
/*
* One-time mddb initialisation entry point; it finishes by leaving the
* set it entered.
*
* NOTE(review): the statements that assign s (and any other setup) are
* not present in this view - as shown, s would be used uninitialised.
*/
void
mddb_init(void)
{
mddb_set_t *s;
mddb_setexit(s);
}
/*
 * Unload the mddb state of every set (indices 0 through md_nsets - 1),
 * then release the CRC table.
 */
void
mddb_unload(void)
{
	int setno = 0;

	while (setno < md_nsets) {
		mddb_unload_set(setno);
		setno++;
	}

	crcfreetab();
}
/*
* Create a new database record. Visible results: MDDB_E_INVALID for
* bad arguments or an oversized request, MDDB_E_NOTNOW when the
* database state check or a write retry fails, and MDDB_E_NOSPACE when
* there are not enough free blocks for the record or for a new
* directory block.
*
* NOTE(review): the declarator, the local declarations, the allocation
* calls and the final return are not present in this view; presumably
* this is mddb_createrec.
*/
)
{
mddb_set_t *s;
/* LINTED variable unused - used for sizeof calculations */
int i, err = 0;
void *userdata;
#endif
/*
* everyone is supposed to specify if it's a
* 32 bit or a 64 bit record
*/
return (MDDB_E_INVALID);
}
return (err);
if (checkstate(s, MDDB_PROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
if (mddb_maxblocks)
else
maxblocks = (MDDB_BSIZE -
mddb_setexit(s);
return (MDDB_E_INVALID);
}
/*
* allocate record block
* and new directory block so to avoid sleeping
* after starting single_thread
*/
if ((options & MD_CRO_OPTIMIZE) == 0)
/*
* if this is the largest record allocate new buffer for
* checkcopy();
*/
if (recsize > s->s_databuffer_size) {
/*
* this test is repeated in case we had to sleep during
* kmem_alloc and some other task bumped max record size
*/
if (recsize > s->s_databuffer_size) {
if (s->s_databuffer_size)
s->s_databuffer_size);
s->s_databuffer = tmppnt;
s->s_databuffer_size = recsize;
} else {
}
}
/* Search upward from 1 for a record id that is not yet in use. */
newid = 0;
do {
newid++;
if ((options & MD_CRO_OPTIMIZE) == 0)
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
break;
}
break;
}
} while (dbp);
(sizeof (mddb_block_t) * blkcnt);
/*
* see if a directory block exists which will hold this entry
*/
}
break;
}
if (dbp) {
if (blkcnt > s->s_freeblkcnt) {
if ((options & MD_CRO_OPTIMIZE) == 0)
mddb_setexit(s);
return (MDDB_E_NOSPACE);
}
} else {
/*
* need to add directory block
*/
if ((options & MD_CRO_OPTIMIZE) == 0)
mddb_setexit(s);
return (MDDB_E_NOSPACE);
}
dbp->db_nextblk = 0;
}
/*
* ready to add record
*/
(sizeof (mddb_block_t) * blkcnt);
if (dbp->db_firstentry) {
} else {
}
/*
* Optimized records have an owner node associated with them in
* a MN diskset. The owner is only set on a node that is actively
* writing to that record. The other nodes will show that record
* as having an invalid owner. The owner for an optimized record
* is used during fixoptrecord to determine which node should
* write out the record when the replicas associated with that
* optimized record have been changed.
*/
if (MD_MNSET_SETNO(s->s_setno)) {
}
switch (flag_type) {
case MD_CRO_OPTIMIZE:
break;
case MD_CRO_STRIPE:
break;
case MD_CRO_MIRROR:
break;
case MD_CRO_RAID:
break;
case MD_CRO_SOFTPART:
break;
case MD_CRO_TRANS_MASTER:
break;
case MD_CRO_TRANS_LOG:
break;
case MD_CRO_HOTSPARE:
break;
case MD_CRO_HOTSPARE_POOL:
break;
case MD_CRO_CHANGELOG:
break;
}
/*
* try to get all blocks consecutive. If not possible
* just get them one at a time
*/
for (i = 1; i < blkcnt; i++)
} else {
for (i = 0; i < blkcnt; i++)
}
/* Do we have to create an old style (32 bit) record? */
if (options & MD_CRO_32BIT) {
} else {
}
/* set de_rb_userdata for non optimization records */
if ((options & MD_CRO_OPTIMIZE) == 0) {
}
/* Generate the crc for this record */
/*
* the following code writes new records to all instances of
* the data base. Writing one block at a time to each instance
* is safe because they are not yet in a directory entry which
* has been written to the data base
*/
err = 0;
if ((options & MD_CRO_OPTIMIZE) == 0) {
for (i = 0; i < blkcnt; i++) {
tmppnt += MDDB_BSIZE;
}
} else {
if ((MD_MNSET_SETNO(s->s_setno)) &&
/*
* If a MN diskset then only master writes out newly
* created optimized record.
*/
}
}
/* Don't include opt resync and change log records in global XOR */
if (prevdbp) {
}
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
}
mddb_setexit(s);
}
/*
* Delete a database record. Returns MDDB_E_NOTNOW when the database
* state check fails or the write retry fails, and 0 otherwise -
* including the case where the record is not found, which is treated
* as the retry of a delete that failed earlier (s_staledeletes).
*
* NOTE(review): the declarator, the record lookup and the block-free
* and write calls are not present in this view; presumably this is
* mddb_deleterec.
*/
int
)
{
mddb_set_t *s;
int i;
#endif
if (checkstate(s, MDDB_PROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
break;
}
break;
}
/*
* no such record
*/
ASSERT(s->s_staledeletes != 0);
s->s_staledeletes--;
mddb_setexit(s);
return (0);
}
}
if (dep->de_icreqsize)
else
}
for (i = 0; i < dep->de_blkcount; i++)
if (dep1)
else
if (writeretry(s)) {
/*
* staledelete is used to mark deletes which failed.
* its only use is to not panic when the user retries
* the delete once the database is active again
*/
s->s_staledeletes++;
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
}
mddb_setexit(s);
return (0);
}
/*
* Record iteration helper. The searching flag is cleared once the
* starting point has been located; if it is still set when the scan
* ends, MDDB_E_NORECORD is returned, otherwise 0.
*
* NOTE(review): the declarator, the loops and the success return are
* not present in this view; presumably this is mddb_getnextrec.
*/
)
{
mddb_set_t *s;
return (err);
if (searching) {
searching = 0;
} else {
mddb_setexit(s);
}
}
}
}
mddb_setexit(s);
if (searching)
return (MDDB_E_NORECORD);
return (0);
}
/*
* Look up a record and return a pointer for it (rval), or NULL when the
* set cannot be entered or no matching entry is found. The value
* depends on whether the entry has de_rb_userdata.
*
* NOTE(review): the declarator, the loops and the assignments to rval
* are not present in this view; presumably this is mddb_getrecaddr.
*/
void *
)
{
mddb_set_t *s;
void *rval;
return (NULL);
continue;
if (dep->de_rb_userdata)
else
mddb_setexit(s);
return (rval);
}
}
mddb_setexit(s);
return (NULL);
}
/*
* Look up a record and return its directory entry (dep), or NULL when
* the set cannot be entered or no matching entry is found.
*
* NOTE(review): the declarator (including the return type), the loops
* and the match condition are not present in this view.
*/
)
{
mddb_set_t *s;
return (NULL);
continue;
mddb_setexit(s);
return (dep);
}
}
mddb_setexit(s);
return (NULL);
}
/*
* Look up a record and return a pointer for it, with a resize step for
* entries that have no de_rb_userdata. Returns NULL when the set cannot
* be entered or the record is not found.
*
* NOTE(review): the declarator, the lookup loops, the allocation and
* the copy are not present in this view; presumably this is
* mddb_getrecaddr_resize.
*/
void *
)
{
mddb_set_t *s;
return (NULL);
continue;
if (dep->de_rb_userdata)
else
break;
}
break;
}
mddb_setexit(s);
return (NULL);
}
if (dep->de_rb_userdata) {
mddb_setexit(s);
return (rval);
}
} else {
/* LINTED variable unused - used for sizeof calculations */
icsize, MDDB_BSIZE);
"nonoptimized records can be resized\n");
}
mddb_setexit(s);
return (rval);
}
/*
* Look up a record and return the value held in the local private;
* returns err when the set cannot be entered and MDDB_E_NORECORD when
* no matching entry is found.
*
* NOTE(review): the declarator, the loops and the assignment to private
* are not present in this view.
*/
int
)
{
mddb_set_t *s;
int err = 0;
int private;
return (err);
continue;
mddb_setexit(s);
return (private);
}
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
/*
* Look up a record and update it in place; nothing is returned. Both
* failure to enter the set and a missing record trip ASSERT(0).
*
* NOTE(review): the declarator, the loops and the store itself are not
* present in this view.
*/
void
)
{
mddb_set_t *s;
ASSERT(0);
return;
}
continue;
mddb_setexit(s);
return;
}
}
mddb_setexit(s);
ASSERT(0);
}
/*
* Look up a record and return one attribute of it (rval); returns err
* when the set cannot be entered and MDDB_E_NORECORD when no matching
* entry is found.
*
* NOTE(review): the declarator (including the return type), the
* declaration of rval, the loops and the assignment to rval are not
* present in this view.
*/
)
{
mddb_set_t *s;
int err = 0;
return (err);
continue;
mddb_setexit(s);
return (rval);
}
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
/*
* Look up a record and return one integer attribute of it (rval);
* returns err when the set cannot be entered and MDDB_E_NORECORD when
* no matching entry is found.
*
* NOTE(review): the declarator, the loops and the assignment to rval
* are not present in this view, so which attribute is returned cannot
* be established here.
*/
int
)
{
mddb_set_t *s;
int err = 0;
int rval;
return (err);
continue;
mddb_setexit(s);
return (rval);
}
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
/*
* Look up a record and return one integer attribute of it (rval);
* returns err when the set cannot be entered and MDDB_E_NORECORD when
* no matching entry is found.
*
* NOTE(review): the declarator, the loops and the assignment to rval
* are not present in this view, so which attribute is returned cannot
* be established here.
*/
int
)
{
mddb_set_t *s;
int err = 0;
int rval;
return (err);
continue;
mddb_setexit(s);
return (rval);
}
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
/*
* Look up a record and report its status as an mddb_recstatus_t.
* MDDB_NODATA and MDDB_STALE are the two values visibly assigned to
* the result; a failure to enter the set is returned as the error code
* cast to mddb_recstatus_t.
*
* NOTE(review): the declarator, the lookup loops and the conditions
* selecting between the status values are not present in this view -
* as shown, MDDB_STALE would always overwrite MDDB_NODATA.
*/
)
{
mddb_set_t *s;
int err = 0;
return ((mddb_recstatus_t)err);
break;
}
if (dep)
break;
}
if (! dep)
e_err = MDDB_NODATA;
e_err = MDDB_STALE;
mddb_setexit(s);
return (e_err);
}
/*
* Commit given record to disk.
* If committing an optimized record, do not call
* with md ioctl lock held.
*/
int
)
{
mddb_set_t *s;
static int err = 0;
int li;
int i, j;
int rval;
int hit_err = 0;
if (checkstate(s, MDDB_PROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
mddb_setexit(s);
return (0);
}
break;
}
if (dep)
break;
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
ids[1] = 0;
mddb_setexit(s);
return (mddb_commitrecs(ids));
}
/*
* following code allows multiple processes to be doing
* optimization commits in parallel.
* NOTE: if lots of optimization commits then the lock
* will not get released until it winds down
*/
if (s->s_optwaiterr) {
while (s->s_optwaiterr) {
s->s_opthungerr = 1;
}
if (checkstate(s, MDDB_PROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
}
if (s->s_optcmtcnt++ == 0) {
s->s_opthavelck = 1;
if (s->s_optwantlck) {
cv_broadcast(&s->s_optwantlck_cv);
s->s_optwantlck = 0;
}
} else {
while (! s->s_opthavelck) {
s->s_optwantlck = 1;
}
}
break;
}
if (dep)
break;
}
if (! (--s->s_optcmtcnt)) {
s->s_opthavelck = 0;
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
rbp->rb_commitcnt++;
/* Generate the crc for this record */
if (writeoptrecord(s, dep)) {
if (MD_MNSET_SETNO(s->s_setno)) {
hit_err = 1;
}
s->s_optwaiterr++;
}
if (MD_MNSET_SETNO(s->s_setno)) {
/* If last thread out, release single_thread_start */
if (! (--s->s_optcmtcnt)) {
s->s_opthavelck = 0;
}
/*
* If this thread had a writeoptrecords failure, then
* need to send message to master.
* But, multiple threads could all be running on the
* same single_thread_start, so serialize the threads
* by making each thread grab single_thread_start.
*
* After return from sending the message to the master,
* replicas associated with optimized record will have
* been changed (via a callback from the master to all
* nodes), so retry call to writeoptrecord.
* This code is replacing the call to writeretry that
* occurs for the local and traditional disksets.
*/
if (hit_err) {
/*
* If > 50% of replicas are alive then continue
* to send message to master until writeoptrecord
* succeeds. For now, assume that minor name,
* major number on this node is the same as on
* the master node. Once devids are turned on
* for MN disksets, can send devid.
*/
sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
while (!(md_get_setstatus(s->s_setno) &
MD_SET_TOOFEW)) {
sizeof (md_mn_msg_mddb_optrecerr_t));
for (i = 0; i < 2; i++) {
for (j = 0; j < MD_MNMAXSIDES; j++) {
mnslp =
break;
}
if (j == MD_MNMAXSIDES)
continue;
}
/* Release locks */
/*
* Send message to master about optimized
* record failure. After return, master
* should have marked failed replicas
* and sent parse message to slaves causing
* slaves to have fixed up the optimized
* record.
* On return from ksend_message, retry
* the write since this node should have fixed
* the optimized resync records it owns.
*/
(char *)msg_recerr,
sizeof (md_mn_msg_mddb_optrecerr_t),
kres);
"Unable to send optimized "
"resync record failure "
"message to other nodes in "
"diskset %s\n", s->s_setname);
"MD_MN_MSG_MDDB_OPTRECERR");
}
/* Regrab locks */
/* Start over in case mddb changed */
break;
}
if (dep)
break;
}
if (dep) {
rbp->rb_commitcnt++;
/* Generate the crc for this record */
/*
* If writeoptrecord succeeds, then
* break out.
*/
if (!(writeoptrecord(s, dep)))
break;
}
}
sizeof (md_mn_msg_mddb_optrecerr_t));
/* Resync record should be fixed - if possible */
s->s_optwaiterr--;
if (s->s_optwaiterr == 0) {
/* All errors have been handled */
if (s->s_opthungerr) {
s->s_opthungerr = 0;
cv_broadcast(&s->s_opthungerr_cv);
}
}
mddb_setexit(s);
return (MDDB_E_NOTNOW);
} else {
return (0);
}
}
} else {
/* If set is a traditional or local set */
if (! (--s->s_optcmtcnt)) {
err = 0;
if (s->s_optwaiterr) {
err = writeretry(s);
s->s_optwaiterr = 0;
if (s->s_opthungerr) {
s->s_opthungerr = 0;
cv_broadcast(&s->s_opthungerr_cv);
}
}
s->s_opthavelck = 0;
mddb_setexit(s);
if (err)
return (MDDB_E_NOTNOW);
return (0);
}
if (s->s_optwaiterr) {
while (s->s_optwaiterr) {
s->s_opthungerr = 1;
cv_wait(&s->s_opthungerr_cv,
}
if (checkstate(s, MDDB_NOPROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
}
}
mddb_setexit(s);
return (0);
}
int
)
{
mddb_set_t *s;
int li;
int err = 0;
if (panicstr)
/*
* scan through and make sure ids are from the same set
*/
if (checkstate(s, MDDB_PROBE)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
err = 0;
if (! ids[0]) {
mddb_setexit(s);
return (0);
}
/*
* scan through and make sure ids all exist
*/
break;
}
break;
}
mddb_setexit(s);
return (MDDB_E_NORECORD);
}
}
/*
* scan through records fix commit counts and
* zero fiddles and update time stamp and rechecksum record
*/
checksum = 0;
while (*idp) {
break;
}
break;
}
/* Don't do fiddles for CHANGE LOG records */
rbp->rb_checksum_fiddle = 0;
}
rbp->rb_commitcnt++;
/* Generate the crc for this record */
/* Don't do fiddles for CHANGE LOG records */
}
idp++;
}
if (saverbp)
/*
* If this is a MN set but we are not the master, then we are not
* supposed to update the mddb on disk. So we finish at this point.
*/
mddb_setexit(s);
return (0);
}
continue;
while (*idp) {
break;
}
if (err)
break;
idp++;
}
if (err)
break;
}
if (err) {
if (writeretry(s)) {
mddb_setexit(s);
return (MDDB_E_NOTNOW);
}
}
mddb_setexit(s);
return (0);
}
)
{
}
)
{
}
char *
)
{
}
)
{
return (0);
}
int
)
{
return (1);
return (1);
return (0);
}
/*ARGSUSED*/
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
return (0);
mddb_setexit(s);
return (0);
}
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
/*
* This should be the only thing that prevents LOCAL sets from having
* mediators, at least in the kernel, userland needs to have some code
* written.
*/
if (setno == MD_LOCAL_SET)
return (0);
mddb_setexit(s);
return (0);
}
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
return (0);
(void) upd_med(s, "updmed_ioctl()");
mddb_setexit(s);
return (0);
}
int
{
int err = 0;
int snarf_ok = 0;
return (0);
goto out;
snarf_ok = 1;
}
}
}
out:
/*
* In the case that the snarf failed, the diskset is
* left with s_db set, but s_lbp not set. The node is not
* an owner of the set and won't be allowed to release the
* diskset in order to cleanup. With s_db set, any call to the
* GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
* will cause the diskset to be loaded. So, cleanup the diskset so
* that an inadvertent start of the diskset doesn't happen later.
*/
}
return (err);
}
/*ARGSUSED*/
int
{
int err = 0;
/*
* Data integrity check
*/
/*
* Attempt to mark set as HOLD. If it is marked as HOLD, this means
* that the mirror code is currently searching all mirrors for an
* errored component that needs a hotspare. While this search is in
* progress, we cannot release the set and therefore we return EBUSY.
* Once we have set HOLD, the mirror function (check_4_hotspares) will
* block before the search until the set is released.
*/
if (md_holdset_testandenter(setno) != 0) {
return (EBUSY);
}
NODEV64);
}
return (err);
}
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
return (0);
/*
* Data tags not supported on MN sets so return invalid operation.
* This ioctl could be called before the mddb has been read in so
* the set status may not yet be set to MNSET, so code following
* this check must handle a MN diskset properly.
*/
mddb_setexit(s);
}
/* s_dtlp is NULL for MN diskset */
sizeof (mddb_dtag_t));
break;
}
}
/* Walked the whole list and id not found, return error */
mddb_setexit(s);
}
mddb_setexit(s);
return (0);
}
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
return (0);
/*
* Data tags not supported on MN sets so return invalid operation.
* This ioctl could be called before the mddb has been read in so
* the set status may not yet be set to MNSET, so code following
* this check must handle a MN diskset properly.
*/
mddb_setexit(s);
}
/* Validate and find the id requested - nothing found if MN diskset */
mddb_setexit(s);
}
/* Usetag is only valid when more than one tag exists */
if (dtl_cntl(s) < 2) {
mddb_setexit(s);
}
/* Put the selected tag in place */
/* Save the hint information */
mddb_setexit(s);
s = NULL;
/* shorthand */
/* Let unload know not to free the tag */
/* Release the set */
goto out;
err = 1;
goto out;
}
/* Re-init set using the saved mddb_config_t structure */
goto out;
}
}
/* use the saved rip structure */
/* Let the take code know a tag is being used */
mddb_setexit(s);
s = NULL;
/* Take the set */
goto out;
out:
if (trip)
if (s)
mddb_setexit(s);
return (err);
}
int
{
mddb_set_t *s;
int err = 0;
mdclrerror(ep);
return (0);
/*
* Data tags not supported on MN sets so return invalid operation.
* mddb is guaranteed to be incore at this point, so this
* check will catch all MN disksets.
*/
mddb_setexit(s);
}
/* Tag the data */
goto out;
}
/* If we had a BADTAG, it will be re-written, so clear the bit. */
goto out;
}
mddb_setexit(s);
s = NULL;
/* shorthand */
/* Clear the keeptag */
/* Release the set */
goto out;
goto out;
}
/* Re-init set using the saved mddb_config_t structure */
goto out;
}
}
/* Free the allocated rip structure */
/* use the saved rip structure */
/* Let the set init code know an accept is in progress */
mddb_setexit(s);
s = NULL;
/* Take the set */
goto out;
out:
if (trip)
if (s)
mddb_setexit(s);
return (err);
}
/*
* mddb_getinvlb_devid - cycles through the locator block and determines
* if the device id's for any of the replica disks are invalid.
* If so, it returns the diskname in the ctdptr.
* RETURN
* -1 Error
* cnt number of invalid device id's
*/
int
int count,
int size,
char **ctdptr
)
{
mddb_set_t *s;
int err = 0;
int li;
int len;
int cnt = 0;
char *cptr;
int i, dont_add_it;
char *tmpname;
return (-1);
}
mddb_setexit(s);
return (-1);
}
/* check for lb being devid style */
/* Only if devid exists and isn't valid */
/*
* if we count more invalid did's than
* was passed in there's an error somewhere
*/
mddb_setexit(s);
return (-1);
}
/*
* Future note: Need to do something here
* for the MN diskset case when device ids
* are supported in disksets.
* Can't add until merging devids_in_diskset
* code into code base.
*/
/*
* check to make sure length of device name is
* not greater than computed first time through
*/
mddb_setexit(s);
return (-1);
}
/* strip off slice part */
*tmpname = '\0';
dont_add_it = 0;
/* look to see if diskname is already in list */
for (i = 0; i < (cnt-1); i++) {
/* already there, don't add */
dont_add_it = 1;
break;
}
/* point to next diskname in list */
}
if (dont_add_it == 0) {
/* add diskname to list */
}
}
}
}
/* null terminate the list */
*cptr = '\0';
/*
* need to save the new pointer so that calling routine can continue
* to add information onto the end.
*/
mddb_setexit(s);
return (cnt);
}
/*
* mddb_validate_lb - count the number of lb's with invalid device id's. Keep
* track of length of longest devicename.
*
* NOTE(review): rmaxsz is presumably the out-parameter that receives the
* longest device name length mentioned above - confirm against callers.
*
* RETURN
* -1 error
* cnt number of lb's with invalid devid's
*/
int
int *rmaxsz
)
{
mddb_set_t *s;
int err = 0;
int li;
int len;
/* Running count of invalid devids; this is the value returned on success */
int cnt = 0;
return (-1);
mddb_setexit(s);
return (-1);
}
/* lb must be in devid style */
goto mvl_out;
char *minor_name;
int get_rval;
continue;
/* Here we know, did exists but isn't valid */
} else {
cnt++;
/*
* Future note: Need to do something here
* for the MN diskset case when device ids
* are supported in disksets.
* Can't add until merging devids_in_diskset
* code into code base.
*/
}
}
/*
* A non-zero return from push_lb() replaces the count with the
* -1 error value, so the caller sees an error instead of a count.
*/
if (push_lb(s) != 0)
cnt = -1;
mddb_setexit(s);
return (cnt);
}
int
{
mddb_set_t *s;
int li;
int active = 0;
/* there is nothing here..so we can unload */
return (0);
}
return (0);
}
active = 1;
break;
}
}
return (active);
}
/*
* regetoptrecord:
* --------------
* Update the in-core optimized resync record contents by re-reading the
* record from the on-disk metadb.
* The contents of the resync record will be overwritten by calling this
* routine. This means that callers that require the previous contents to
* be preserved must save the data before calling this routine.
*
* Returns nothing; the visible failure path clears rb_private on the
* record block and returns early.
*/
static void
mddb_set_t *s,
)
{
int li;
int i;
int err = 0;
#endif
/*
* Two candidate copies are tried, indexed by i; any failed check moves
* on to the next candidate.
* NOTE(review): presumably i indexes the two replica locations held in
* the directory entry's de_optinfo[] - confirm.
*/
for (i = 0; i < 2; i++) {
continue;
continue;
if (err)
continue;
continue;
continue;
/* Check the crc for this record */
continue;
}
break;
}
}
/*
* NOTE(review): this early return appears to be the "no usable copy
* was read" path - confirm against the full source.
*/
rbp->rb_private = 0;
return;
}
/* Generate the crc for this record */
}
/*
* mddb_reread_rr:
* Re-read the resync record from the on-disk copy. This is required for
* multi-node support so that a new mirror-owner can determine if a resync
* operation is required to guarantee data integrity.
*
* Arguments:
* setno Associated set
* id Resync record ID
*
* Return Value:
* 0 successful reread
* -1 invalid set (not multi-node or non-existent)
* >0 metadb state invalid
*/
int
)
{
mddb_set_t *s;
int err = 0;
return (-1);
return (-1);
mddb_setexit(s);
return (-1);
}
break;
}
/*
* Refresh the in-core copy from disk. regetoptrecord() overwrites the
* current in-core contents of the record and reports no status, so
* this path always yields 0.
*/
regetoptrecord(s, dep);
err = 0;
} else {
/* NOTE(review): presumably the "record not found" case - confirm */
err = -1;
}
mddb_setexit(s);
return (err);
}
/*
* Set owner associated with MN optimized resync record.
*
* Optimized records have an owner node associated with them in
* a MN diskset. The owner is only set on a node that is actively
* writing to that record. The other nodes will show that record
* as having an invalid owner. The owner for an optimized record
* is used during fixoptrecord to determine which node should
* write out the record when the replicas associated with that
* optimized record have been changed.
*
* Called directly from mirror driver and not from an ioctl.
*
* Returns
* 0 if successful.
* MDDB_E_NORECORD if record not found.
*/
int
)
{
mddb_set_t *s;
/* Set to 1 once the matching record has been located */
int found = 0;
return (MDDB_E_NORECORD);
return (MDDB_E_NORECORD);
continue;
found = 1;
break;
}
if (found)
break;
}
mddb_setexit(s);
if (!found) {
return (MDDB_E_NORECORD);
}
/*
* The return type is int, so report success with the integer 0
* rather than the pointer constant NULL.
*/
return (0);
}
/*
* mddb_parse re-reads portions of the mddb from disk given a list
* of good replicas to read from and flags describing
* which portion of the mddb to read in.
*
* Used in a MN diskset when the master has made a change to some part
* of the mddb and wants to relay this information to the slaves.
*/
int
{
mddb_set_t *s;
int err = 0;
int rval = 0;
int i, li;
int found_good_one = 0;
return (EINVAL);
return (0);
}
return (EINVAL);
}
/*
* Master node initiated this request, so there's no work for
* the master node to do.
*/
return (rval);
}
lbp = 0;
for (i = 0; i < MDDB_NLB; i++) {
/* Walk through master's active list */
continue;
if (s->s_mbiarray[i] == NULL)
continue;
/* Assumes master blocks are already setup */
}
if (err)
continue;
continue;
continue;
continue;
NULL))
continue;
continue;
/*
* a commit count of zero means this locator has
* been deleted
*/
if (lbp->lb_commitcnt == 0) {
continue;
}
/* Found a good locator - keep it */
found_good_one = 1;
break;
}
/*
* If found a good copy of the mddb, then read it into
* this node's locator block. Fix up the set's s_mbiarray
* pointer (master block incore array pointer) to be
* in sync with the newly read in locator block. If a
* new mddb was added, read in the master blocks associated
* with the new mddb. If an mddb was deleted, free the
* master blocks associated with deleted mddb.
*/
if (found_good_one) {
/* Compare old and new view of mddb locator blocks */
int mn_set;
/* If old and new views match, continue */
continue;
/*
* If new mddb has been added - delete
* old mbiarray and get new one.
*
* When devids are supported, will
* need to get dev from devid.
*/
if (s->s_mbiarray[li]) {
}
/*
* If getmasters fails, getmasters
* will set appropriate error flags.
*/
/*
* If old one has been deleted -
* delete old mbiarray.
*/
if (s->s_mbiarray[li]) {
}
}
}
/* Free this node's old view of mddb locator blocks */
} else {
if (lbp)
}
}
continue;
/* Successfully read the locator names */
if (readlocnames(s, li) == 0)
break;
}
/* Did not successfully read locnames; restore lnp */
} else {
/* readlocnames successful, free old struct */
}
}
int writeout;
/*
* Walk through directory block and directory entry incore
* linked list looking for optimized resync records.
* For each opt record found, re-read in directory block.
* The directory block consists of a number of directory
* entries. The directory entry for this opt record will
* describe which 2 mddbs actually contain the resync record
* since it could have been relocated by the master node
* due to mddb failure or mddb deletion. If this node
* is the record owner for this opt record, then write out
* the record to the 2 mddbs listed in the directory entry
* if the mddbs locations are different than previously known.
*/
/* Found an opt record */
break;
}
/* If no opt records found, go to next dbp */
continue;
/*
* Reread directory block from disk since
* master could have rewritten it during fixoptrecord.
*/
KM_SLEEP);
continue;
if (err)
continue;
/* Reverify db; go to next mddb if bad */
db32p->db32_revision)) ||
MDDB_BSIZE, NULL))) {
continue;
} else {
break;
}
}
/*
* If all mddbs are unavailable then panic since
* this slave cannot be allowed to continue out-of-sync
* with the master node. Since the optimized resync
* records are written by all nodes, all nodes must
* stay in sync with the master.
*
* This also handles the case when all storage
* connectivity to a slave node has failed. The
* slave node will send an MDDB_OPTRECERR message to
* the master node when the slave node has been unable
* to write an optimized resync record to both
* designated mddbs. After the master has fixed the
* optimized records to be on available mddbs, the
* MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
* is sent to all slave nodes. If a slave node is
* unable to access any mddb in order to read in the
* relocated optimized resync record, then the slave
* node must panic.
*/
"access any SVM state database "
"replicas for diskset %s\n",
s->s_setname);
}
/*
* Setup temp copy of linked list of de's.
* Already have an incore copy, but need to walk
* the directory entry list contained in the
* new directory block that was just read in above.
* After finding the directory entry of an opt record
* by walking the incore list, find the corresponding
* entry in the temporary list and then update
* the incore directory entry record with
* the (possibly changed) mddb location stored
* for the optimized resync records.
*/
de32p = (mddb_de32_t *)
((void *) ((caddr_t)
(&db32p->db32_firstentry)
+ sizeof (db32p->db32_firstentry)));
tdep = (mddb_de_ic_t *)
kmem_zalloc(sizeof (mddb_de_ic_t) -
sizeof (mddb_block_t) +
sizeof (mddb_block_t) *
sizeof (mddb_de_ic_t) -
sizeof (mddb_block_t) +
sizeof (mddb_block_t) *
}
/* Now, walk the incore directory entry list */
continue;
/*
* Found an opt record in the incore copy.
* Find the corresponding entry in the temp
* list. If anything has changed in the
* opt record info between the incore copy
* and the temp copy, update the incore copy
* and set a flag to writeout the opt record
* to the new mddb locations.
*/
writeout = 0;
/* Check first mddb location */
dep->de_optinfo[0] =
tdep->de_optinfo[0];
writeout = 1;
}
/* Check second mddb location */
writeout = 1;
}
/* Record owner should rewrite it */
if ((writeout) &&
(dep->de_owner_nodeid ==
s_nodeid)) {
(void) writeoptrecord(s,
dep);
}
break;
}
}
}
/*
* Update the incore checksum information for this
* directory block to match the newly read in checksum.
* This should have only changed if the incore and
* temp directory entries differed, but it takes
* more code to do the check than to just update
* the information every time.
*/
/* Now free everything */
while (tdep) {
}
}
rval = 0;
}
out:
return (rval);
}
int
{
mddb_set_t *s;
int err = 0;
return (EINVAL);
/*
* If the new_master flag is set for this setno we are in the middle
* of a reconfig cycle, and blocking or unblocking is not needed.
* Hence we can return success immediately
*/
return (0);
}
return (0);
}
return (EINVAL);
}
return (err);
}
/*
* mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
* to relocate any optimized resync records to available mddbs.
* This routine is only called on the master node.
*
* Used in a MN diskset when a slave node has failed to write an optimized
* resync record. The failed mddb information is sent to the master node
* so the master can relocate the optimized records, if possible. If the
* failed mddb information has a mddb marked as failed that was previously
* marked active on the master, the master sets its incore mddb state to
* EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts
* to relocate any optimized records on the newly failed mddbs by calling
* fixoptrecords. (fixoptrecords will set the PARSE_OPTRECS flag if any
* optimized records are relocated.)
*
* When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
* flags and will send a PARSE message to the slave nodes. The PARSE_LOCBLK
* flag causes the slave node to re-read in the locator block from disk.
* The PARSE_OPTRECS flag causes the slave node to re-read in the directory
* blocks and write out any optimized resync records that have been
* relocated to a different mddb.
*/
int
{
mddb_set_t *s;
int err = 0;
int li;
int i, j;
int something_changed = 0;
int setno;
return (EINVAL);
return (0);
}
mddb_setexit(s);
return (EINVAL);
}
/*
* If slave node has seen an mddb failure, but the master node
* hasn't encountered this failure, mark the mddb as failed on
* the master node and set the something_changed flag to 1.
*/
for (i = 0; i < 2; i++) {
for (j = 0; j < MD_MNMAXSIDES; j++) {
break;
}
/* Do quick check using li */
if (j != MD_MNMAXSIDES)
if ((j != MD_MNMAXSIDES) &&
MD_MAXDRVNM) == 0) &&
something_changed = 1;
}
} else {
/*
* Passed in li from slave does not match
* the replica in the master's structures.
* This could have occurred if a delete
* mddb command was running when the
* optimized resync record had a failure.
* Search all replicas for this entry.
* If no match, just ignore.
* If a match, set replica in error.
*/
continue;
for (j = 0; j < MD_MNMAXSIDES; j++) {
mnslp =
break;
}
if (j == MD_MNMAXSIDES)
continue;
MD_MAXDRVNM) == 0) &&
== 0)) {
something_changed = 1;
}
break;
}
}
}
}
}
/*
* If this message changed nothing, then we're done since this
* failure has already been handled.
* If some mddb state has been changed, send a parse message to
* the slave nodes so that the slaves will re-read the locator
* block from disk.
*/
if (something_changed == 0) {
mddb_setexit(s);
return (0);
} else {
s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
}
/*
* Scan replicas setting MD_SET_TOOFEW if
* 50% or more of the mddbs have seen errors.
* Note: Don't call selectreplicas or writeretry
* since these routines may end up setting the ACTIVE flag
* on a failed mddb if the master is able to access the mddb
* but the slave node couldn't. Need to have the ACTIVE flag
* turned off in order to relocate the optimized records to
* mddbs that are (hopefully) available on all nodes.
*/
alc = 0;
lc = 0;
continue;
lc++;
continue;
alc++;
}
/*
* If more than 50% mddbs have failed, then don't relocate opt recs.
* The node sending the mddb failure information will detect TOOFEW
* and will panic when it attempts to re-write the optimized record.
*/
(void) push_lb(s);
mddb_setexit(s);
return (0);
}
/* Attempt to relocate optimized records that are on failed mddbs */
(void) fixoptrecords(s);
/* Push changed locator block out to disk */
(void) push_lb(s);
/* Recheck for TOOFEW after writing out locator blocks */
alc = 0;
lc = 0;
continue;
lc++;
continue;
alc++;
}
/* If more than 50% mddbs have failed, then don't relocate opt recs */
mddb_setexit(s);
return (0);
}
mddb_setexit(s);
return (0);
}
/*
* Check if incore mddb on master node matches ondisk mddb.
* If not, master writes out incore view to all mddbs.
* Have previously verified that master is an owner of the
* diskset (master has snarfed diskset) and that diskset is
* not stale.
*
* Meant to be called during reconfig cycle during change of master.
* Previous master in diskset may have changed the mddb and
* panic'd before relaying information to slave nodes. New
* master node just writes out its incore view of the mddb and
* the replay of the change log will resync all the nodes.
*
* Only supported for MN disksets.
*
* Return values:
* 0 - success
* non-zero - failure
*/
int
{
int err = 0;
mddb_set_t *s;
int li;
int write_out_mddb;
int mddb_err = 0;
int prev_li = 0;
int rval = 0;
int mddbs_present = 0;
/* Verify that setno is in valid range */
return (EINVAL);
return (0);
}
/* Calling diskset must be a MN diskset */
if (!(MD_MNSET_SETNO(setno))) {
mddb_setexit(s);
return (EINVAL);
}
/* Re-verify that set is not stale */
mddb_setexit(s);
}
/*
* Previous master could have died during the write of data to
* the mddbs so that the ondisk mddbs may not be consistent.
* So, need to check the contents of the first and last active mddb
* to see if the mddbs need to be rewritten.
*/
int checkcopy_err;
/* Find replica that is active */
continue;
mddbs_present = 1;
continue;
continue;
/* Check locator block */
KM_SLEEP);
/* read in on-disk locator block */
/* If err, try next mddb */
if (err) {
continue;
}
/*
* We resnarf all changelog entries for this set.
* They may have been altered by the previous master
*/
continue;
}
/* This has been alloc'ed while joining the set */
}
if (dep->de_rb_userdata) {
}
if (err) {
/*
* When we see an error while reading the
* changelog entries, we move on to the next
* mddb
*/
err = 1;
break; /* out of inner for-loop */
}
}
if (err)
break; /* out of outer for-loop */
}
/* If err, try next mddb */
if (err) {
continue;
}
/* Is incore locator block same as ondisk? */
== 1) {
write_out_mddb = 1;
break;
}
/* If lb ok, check locator names */
KM_SLEEP);
/* read in on-disk locator names */
/* If err, try next mddb */
if (err) {
continue;
}
/* Are incore locator names same as ondisk? */
== 1) {
write_out_mddb = 1;
break;
}
/*
* Check records in mddb.
* If a read error is encountered, set the error flag and
* continue to the next mddb. Otherwise, if incore data is
* different from ondisk, then set the flag to write out
* the mddb and break out.
*/
if (checkcopy_err == MDDB_F_EREAD) {
mddb_err = 1;
continue;
} else if (checkcopy_err == 1) {
write_out_mddb = 1;
break;
}
/*
* Have found first active mddb and the data is the same as
* incore - break out of loop
*/
write_out_mddb = 0;
break;
}
/*
* Skip checking for last active mddb if:
* - already found a mismatch in the first active mddb
* (write_out_mddb is 1) OR
* - didn't find a readable mddb when looking for first
* active mddb (there are mddbs present but all failed
* when read was attempted).
*
* In either case, go to write_out_mddb label in order to attempt
* to write out the data. If < 50% mddbs are available, panic.
*/
if ((write_out_mddb == 1) ||
write_out_mddb = 1;
goto write_out_mddb;
}
/*
* Save which index was checked for the first active mddb. If only 1
* active mddb, don't want to recheck the same mddb when looking for
* last active mddb.
*/
/*
* Now, checking for last active mddb. If found same index as before
* (only 1 active mddb), then skip.
*/
int checkcopy_err;
/* Find replica that is active */
continue;
continue;
continue;
/* If already checked mddb, bail out */
break;
/* Check locator block */
KM_SLEEP);
/* read in on-disk locator block */
/* If err, try next mddb */
if (err) {
continue;
}
/* Is incore locator block same as ondisk? */
== 1) {
write_out_mddb = 1;
break;
}
/* If lb ok, check locator names */
mnlnp_od = (mddb_mnln_t *)
/* read in on-disk locator names */
/* If err, try next mddb */
if (err) {
continue;
}
/* Are incore locator names same as ondisk? */
== 1) {
write_out_mddb = 1;
break;
}
/*
* Check records in mddb.
* If a read error is encountered, set the error flag and
* continue to the next mddb. Otherwise, if incore data is
* different from ondisk, then set the flag to write out
* the mddb and break out.
*/
if (checkcopy_err == MDDB_F_EREAD) {
mddb_err = 1;
continue;
} else if (checkcopy_err == 1) {
write_out_mddb = 1;
break;
}
/*
* Have found last active mddb and the data is the same as
* incore - break out of loop
*/
write_out_mddb = 0;
break;
}
/*
* If ondisk and incore versions of the mddb don't match, then
* write out this node's incore version to disk.
* Or, if unable to read a copy of the mddb, attempt to write
* out a new one.
*/
if (write_out_mddb) {
/* Recompute free blocks based on incore information */
computefreeblks(s); /* set up free block bits */
/*
* Write directory entries and record blocks.
* Use flag MDDB_WRITECOPY_SYNC so that writecopy
* routine won't write out change log records.
*/
/* Don't write to inactive or deleted mddbs */
continue;
continue;
continue;
/* If encounter a write error, save it for later */
mddb_err = 1;
}
}
/*
* Write out locator blocks to all replicas.
* push_lb will set MDDB_F_EWRITE on replicas that fail.
*/
if (push_lb(s))
mddb_err = 1;
/* Write out locator names to all replicas */
/* writeall sets MDDB_F_EWRITE if a write to a replica fails */
lbp->lb_lnblkcnt, 0))
mddb_err = 1;
/*
* The writes to the replicas above would have set
* the MDDB_F_EWRITE flags if any write error was
* encountered.
* If < 50% of the mddbs are available, panic.
*/
continue;
lc++;
/*
* If mddb:
* - is not active (previously had an error)
* - had an error reading the master blocks or
* - had an error in writing to the mddb
* then don't count this mddb in the active count.
*/
continue;
alc++;
}
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
}
}
/*
* If encountered an error during checking or writing of
* mddbs, call selectreplicas so that replica error can
* be properly handled. This will involve another attempt
* to write the mddb out to any mddb marked MDDB_F_EWRITE.
* If mddb still fails, it will have the MDDB_F_ACTIVE bit
* turned off. Set the MDDB_SCANALLSYNC flag so that
* selectreplicas doesn't overwrite the change log entries.
*
* Set the PARSE_LOCBLK flag in the mddb_set structure to show
* that the locator block has been changed.
*/
if (mddb_err) {
(void) selectreplicas(s, MDDB_SCANALLSYNC);
s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
}
mddb_setexit(s);
return (rval);
}
/*
* Used during reconfig cycle
* Only supported for MN disksets.
*
* Return values:
* 0 - success
* non-zero - failure
*/
int
{
/* Verify that setno is in valid range */
return (EINVAL);
/*
* When setting the flags, the set may not
* be snarfed yet. So, don't check for SNARFED or MNset
* and don't call mddb_setenter.
* In order to discourage bad ioctl calls,
* verify that magic field in structure is set correctly.
*/
return (EINVAL);
case MDDB_NM_SET:
break;
case MDDB_NM_RESET:
break;
case MDDB_NM_GET:
break;
}
return (0);
}
int
)
{
struct nm_next_hdr *nh;
struct nm_name *n;
char *shn;
int retval = 1;
/*
* Load the devid name space if it exists
*/
/*
* Unload the devid namespace
*/
return (0);
}
retval = 0;
goto out;
}
/*
* Look up the key
*/
/*
* Find the entry, update its n_minor if metadevice
*/
== NULL) {
retval = 0;
goto out;
}
}
}
out:
return (retval);
}
static void
mddb_set_t *s
)
{
struct nm_rec_hdr *hdr;
case MDDB_NM_HDR:
case MDDB_DID_NM_HDR:
if (hdr->r_next_recid > 0) {
}
if (hdr->r_next_recid > 0) {
}
break;
case MDDB_NM:
case MDDB_DID_NM:
case MDDB_SHR_NM:
case MDDB_DID_SHR_NM:
hdr = (struct nm_rec_hdr *)
if (hdr->r_next_recid > 0) {
}
break;
default:
break;
}
}
}
}
static int
mddb_set_t *s
)
{
ids[1] = 0;
if (mddb_commitrecs(ids)) {
return (MDDB_E_NORECORD);
}
}
}
}
return (0);
}
static int
mddb_set_t *s
)
{
int err = 0;
if (md_get_setstatus(s->s_setno) &
/*
* It is a replicated set
*/
return (-1);
}
} else {
/*
* It is a non-replicated set
* and there is no need to update
* devid
*/
}
if (err)
return (err);
}
return (0);
}
static int
)
{
struct nm_next_hdr *nh;
char *shrname;
int len;
int err = 0;
/* Import setname */
goto out;
}
/*
* No metadevice is okay
*/
err = 0;
goto out;
}
/*
* We have it, go ahead and update the namespace.
*/
NM_NOCOMMIT)) {
goto out;
}
goto out;
}
ids[2] = 0;
out:
if (shrname)
return (err);
}
/*
* Run the update steps in order: dt, lb (through writelocall()), mb,
* the db records (through update_db_rec()) and finally the setname
* embedded in the namespace.
*
* mddb_setexit() is called before each failure return of the dt, lb and
* mb steps, and once more before update_db_rec() is called, so the db
* record and setname steps run after mddb_setexit().
*
* Return values:
* 0 - success
* non-zero - err from the first step that failed
*/
static int
)
{
mddb_set_t *s;
int err = 0;
return (err);
}
/* Update dt */
}
mddb_setexit(s);
return (err);
}
/* Update lb */
if ((err = writelocall(s)) != 0) {
mddb_setexit(s);
return (err);
}
/* Update mb */
mddb_setexit(s);
return (err);
}
mddb_setexit(s);
/* Update db records */
if ((err = update_db_rec(s)) != 0)
return (err);
/* Update setname embedded in the namespace */
return (err);
}
/*
* Returns at once when the set record has no drive record
* (sr->sr_driverec == 0). Otherwise the drive records are followed,
* with an empty loop body, until one whose dr_nextrec is 0 is reached.
*
* NOTE(review): presumably this locates the tail of the drive record
* list - confirm.
*/
static void
)
{
if (sr->sr_driverec == 0) {
return;
}
drv->dr_nextrec != 0;
;
}
/*
* Set up the record id array that is passed by reference in ids,
* following each drive record's dr_nextrec link while it is non-zero.
*
* NOTE(review): cnt starts at 1, presumably to account for the set
* record - confirm.
*/
static void
mddb_recid_t **ids,
)
{
int cnt;
cnt = 1;
/* CSTYLED */
if (drv->dr_nextrec != 0)
(drv->dr_nextrec);
else
}
}
/*
* Create the set record and the drive records, add the drive records
* to a linked list, set up the record ids and commit all the records.
*
* Return values:
* MDDB_E_INVALID - a record could not be created
* MD_KEYBAD - dr_key is negative
* otherwise - the value left in err
*
* mddb_setexit() is called before each of the returns listed above.
*/
static int
)
{
mddb_set_t *s;
return (err);
/* Create and fill in set record */
MD_CRO_32BIT, MD_LOCAL_SET)) < 0) {
mddb_setexit(s);
return (MDDB_E_INVALID);
}
/* Create and fill in drive records */
/*
* Add entry and create the record
*/
continue;
if (dr_key < 0) {
mddb_setexit(s);
return (MD_KEYBAD);
}
mddb_setexit(s);
return (MDDB_E_INVALID);
}
/*
* We need to check to see if the drive on
* the rip has a replica. If it doesn't have
* a replica, then we need to set the dr_dbcnt
* and dr_dbsize to 0 to reflect that.
*/
} else {
continue;
== 0))
}
}
drc++;
/* Add on the linked list */
}
/*
* Alloc and setup recids which include set record
*/
/*
* Commit all the records
*/
if (ids)
mddb_setexit(s);
return (err);
}
/*
* The namespace is loaded before this is called.
* The purpose of this function is to update the device ids in the entire
* namespace using the data in the ri structure. Compare the devid found in
* the namespace with ri_old_devid and, if they are the same, update it with
* the devid in ri_devid.
*
* Return values:
* 0 - success, or there is no configuration to update
* ENOENT - the shared devid header (did_shr_nh) is NULL, or the
* lookup keyed by n->min_devid_key failed
* ENOMEM - the entry for the new devid could not be obtained
*/
static int
{
struct nm_next_hdr *nh;
struct did_min_name *n;
struct nm_next_hdr *did_shr_nh;
struct did_shr_name *shr_n;
struct did_shr_name *shn;
struct nm_next_hdr *this_did_shr_nh;
/*
* It is okay if we don't have any configuration
*/
== NULL) {
return (0);
}
/* check out every entry in the namespace */
break;
} else {
if (did_shr_nh == NULL) {
return (ENOENT);
}
did_shr_nh, n->min_devid_key, (char *)0,
return (ENOENT);
}
/* find this devid in the incore replica */
== 0) {
/*
* found the corresponding entry
* update with new devid
*/
/* first remove old devid info */
size = ((struct nm_rec_hdr *)
if (size == 0) {
} else {
}
/* the record's r_used_size is reduced for the removed entry */
((struct nm_rec_hdr *)this_did_shr_nh->
nmn_record)->r_used_size -=
/* add in new devid info */
if ((shn = (struct did_shr_name *)
return (ENOMEM);
}
/* Zero entry ends the id list (assumed terminator convention). */
recids[2] = 0;
}
}
}
}
return (0);
}
/*
* Import a set. The steps, in order:
* - load the old replicas and reset the setno afterwards
* - fix up the NM records with md_imp_nm()
* - load the devid name space if it exists and ask each module to fix
* up its unit records
* - fix up and write out the locator, locator name, master and
* directory blocks
* - create the set in MD_LOCAL_SET
* - update the namespace device ids with md_imp_update_namespace_did()
* - halt the set and unload the namespace of the imported set
*
* Return values:
* 0 - success
* EINVAL - failure on the first check
* non-zero - err from the step that failed
*/
/*ARGSUSED*/
int
int mode
)
{
mddb_set_t *s;
int i, err = 0;
return (EINVAL);
}
goto out;
}
/* Set the bit first otherwise load_old_replicas can fail */
goto out;
}
/*
* Upon completion of load_old_replicas, the old setno is
* restored from the disk so we need to reset it
*/
/*
* Fixup the NM records before loading namespace
*/
(void) md_imp_nm(s);
mddb_setexit(s);
/*
* Load the devid name space if it exists
* and ask each module to fixup unit records
*/
goto cleanup;
}
goto cleanup;
}
/* Repeat the pass until one completes with i still 0. */
do {
i = 0;
} while (i);
/*
* Fixup
* (1) locator block
* (2) locator name block if necessary
* (3) master block
* (4) directory block
* calls appropriate writes to push changes out
*/
goto cleanup;
/*
* Create set in MD_LOCAL_SET
*/
goto cleanup;
/*
* Update the namespace device ids if necessary (i.e. block copy disk)
*/
if ((err = md_imp_update_namespace_did(s)) != 0) {
goto cleanup;
}
}
/*
* Halt the set
*/
/*
* Unload the namespace for the imported set
*/
out:
return (err);
}
#endif /* MDDB_FAKE */