/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
*/
/*
* NAME: raid.c
*
* DESCRIPTION: Main RAID driver source file containing open, close and I/O
* operations.
*
* ROUTINES PROVIDED FOR EXTERNAL USE:
* raid_open() - open the RAID metadevice for access.
* raid_internal_open() - internal open routine of RAID metadevice.
* md_raid_strategy() - perform normal I/O operations,
* such as read and write.
* raid_close() - close the RAID metadevice.
* raid_internal_close() - internal close routine of RAID metadevice.
* raid_snarf() - initialize and clean up MDD records.
* raid_halt() - reset the RAID metadevice
* raid_line() - return the line # of this segment
* raid_dcolumn() - return the data column # of this segment
* raid_pcolumn() - return the parity column # of this segment
*/
#include <sys/sysmacros.h>
#ifndef lint
#endif /* lint */
extern int md_status;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_mstr_daemon;
extern int md_sleep_for_test;
extern md_event_queue_t *md_event_queue;
int raid_total_io = 0;
int raid_reads = 0;
int raid_writes = 0;
int raid_no_bpmaps = 0;
int raid_512 = 0;
int raid_1024 = 0;
int raid_1024_8192 = 0;
int raid_8192 = 0;
int raid_8192_bigger = 0;
int raid_line_lock_wait = 0;
int data_buffer_waits = 0;
int parity_buffer_waits = 0;
/* writer line locks */
/* read line locks */
/* prewrite stats */
int raid_pw_invalidates = 0;
int md_oflags);
/*
 * NOTE(review): the declarator line naming this function and its
 * parameters is missing from this chunk of the file; the panic()
 * string below suggests it is raid_col2unit().  It maps a column
 * state to the unit state (rus_state_t) that column state implies.
 * The references to `state' and `unitstate' imply those are its
 * parameters -- confirm against the full source.
 */
static rus_state_t
{
switch (state) {
case RCS_INIT:
/* column still initializing -> unit initializing */
return (RUS_INIT);
case RCS_OKAY:
return (RUS_OKAY);
case RCS_RESYNC:
/*
 * During a resync the unit is still degraded; preserve
 * Last Erred if the unit already carries that state.
 */
if (unitstate & RUS_LAST_ERRED)
return (RUS_LAST_ERRED);
else
return (RUS_ERRED);
case RCS_ERRED:
return (RUS_ERRED);
case RCS_LAST_ERRED:
return (RUS_ERRED);
default:
break;
}
/* unknown column state is a programming error */
panic("raid_col2unit");
/*NOTREACHED*/
}
void
{
int i;
char *devname;
RCS_LAST_ERRED | RCS_REGEN));
== 0);
if (force) {
return;
}
RUS_REGEN));
return;
return;
return;
}
/*
* if there is another column in the error state then this
* column should go to the last errored state
*/
for (i = 0; i < un->un_totalcolumncnt; i++) {
if (i == col)
else
errcnt++;
okaycnt++;
if (colstate & RCS_RESYNC)
resynccnt++;
}
else if (errcnt > 1) {
} else if (errcnt == 1)
if (!(unitstate & RUS_LAST_ERRED))
/*
* if there are last errored column being brought back online
* by open or snarf, then be sure to clear the RUS_LAST_ERRED
* bit to allow writes. If there is a real error then the
* column will go back into last erred.
*/
if (unitstate & RUS_LAST_ERRED) {
/*
* Close the broken device and clear the open flag on
* it. We have to check that the device is open,
* otherwise the first open on it has resulted in the
* error that is being processed and the actual un_dev
* will be NODEV64.
*/
}
/*
* Similar to logic above except no log messages since we
* are just transitioning from Last Erred to Erred.
*/
}
/*
* If a resync has completed, see if there is a Last Erred
* component that we can change to the Erred state.
*/
for (i = 0; i < un->un_totalcolumncnt; i++) {
if (i != col &&
break;
}
}
}
}
/*
* NAME: erred_check_line
*
* DESCRIPTION: Return the type of write to perform on an erred column based
* upon any resync activity.
*
* if a column is being resynced and the write is above the
* resync point may have to write to the target being resynced.
*
* Column state may make it impossible to do the write
* in which case RCL_EIO or RCL_ENXIO is returned.
*
* If a column cannot be written directly, RCL_ERRED is
* returned and processing should proceed accordingly.
*
* PARAMETERS: minor_t mnum - minor number identity of metadevice
* md_raidcs_t *cs - child save structure
* mr_column_t *dcolumn - pointer to data column structure
* mr_column_t *pcolumn - pointer to parity column structure
*
* RETURNS: RCL_OKAY, RCL_ERRED
*
* LOCKS: Expects Line Writer Lock and Unit Resource Lock to be held
* across call.
*/
/*
 * NOTE(review): this is erred_check_line() per the block comment
 * above, but the declarator line with its parameters and the
 * conditional logic guarding these returns are missing from this
 * chunk -- the bare return statements below are fragments of
 * if/else arms whose conditions are not visible here.
 */
static int
{
return (RCL_OKAY);
return (RCL_ERRED); /* do not read from errored disk */
/*
* For the last errored case there are two considerations.
* When the last errored column is the only errored column then
* do treat it like a maintenance column, not doing I/O from
* it. When there are other failures then just attempt
* to use it.
*/
return (RCL_ERRED);
/*
* When a resync from a hotspare is being done (copy resync)
* then always treat it as an OKAY column, since no regen
* is required.
*/
return (RCL_OKAY);
}
return (RCL_OKAY);
}
return (RCL_ERRED);
}
/*
* NAMES: raid_state_cnt
*
* DESCRIPTION: counts number of column in a specific state
*
* PARAMETERS: md_raid_t *un
* rcs_state state
*/
/*
 * NOTE(review): raid_state_cnt() per the comment above; the
 * declarator line (mr_unit_t *un, rcs_state state per that
 * comment) and the per-column state test inside the loop are
 * missing from this chunk, so retval++ appears unconditional
 * here.  Intended result: the number of columns in the given
 * state.
 */
int
{
int i, retval = 0;
for (i = 0; i < un->un_totalcolumncnt; i++)
retval++;
return (retval);
}
/*
* NAMES: raid_io_overlaps
*
* DESCRIPTION: checks for overlap of 2 child save structures
*
* PARAMETERS: md_raidcs_t cs1
* md_raidcs_t cs2
*
* RETURNS: 0 - no overlap
* 1 - overlap
*/
/*
 * NOTE(review): raid_io_overlaps() per the comment above; the
 * declarator line and the range-comparison conditions guarding
 * the first two returns are missing from this chunk.  Per the
 * block comment: returns 0 for no overlap, 1 for overlap.
 */
int
{
return (0);
return (0);
return (1);
}
/*
* NAMES: raid_parent_constructor
* DESCRIPTION: parent structure constructor routine
* PARAMETERS:
*/
/*ARGSUSED1*/
/*
 * NOTE(review): raid_parent_constructor() per the comment above;
 * the declarator line and any initialization statements are
 * missing from this chunk.  Returns 0 (success).
 */
static int
{
return (0);
}
/*
 * NOTE(review): the declarator line naming this empty function is
 * missing from this chunk; its position between the parent
 * constructor and destructor suggests a parent-structure init or
 * free helper -- confirm against the full source.
 */
void
{
}
/*ARGSUSED1*/
/*
 * Destructor callback for parent structures; no per-object
 * teardown is required, so the body is intentionally empty.
 */
static void
raid_parent_destructor(void *p, void *d)
{
}
/*
* NAMES: raid_child_constructor
* DESCRIPTION: child structure constructor routine
* PARAMETERS:
*/
/*ARGSUSED1*/
/*
 * NOTE(review): raid_child_constructor() per the comment above;
 * the declarator line and any initialization statements are
 * missing from this chunk.  Returns 0 (success).
 */
static int
{
return (0);
}
/*
 * NOTE(review): the declarator line naming this empty function is
 * missing from this chunk; its position between the child
 * constructor and destructor suggests a child-structure init or
 * free helper -- confirm against the full source.
 */
void
{
}
/*ARGSUSED1*/
/*
 * Destructor callback for child structures; no per-object
 * teardown is required, so the body is intentionally empty.
 */
static void
raid_child_destructor(void *p, void *d)
{
}
/*ARGSUSED1*/
/*
 * NOTE(review): the declarator line is missing from this chunk;
 * its position before raid_cbuf_destructor() suggests this is the
 * cbuf constructor -- confirm against the full source.  Returns 0.
 */
static int
{
return (0);
}
/*
 * NOTE(review): the declarator line naming this empty static
 * function is missing from this chunk -- confirm its identity
 * against the full source.
 */
static void
{
}
/*ARGSUSED1*/
/*
 * Destructor callback for cbuf structures; no per-object
 * teardown is required, so the body is intentionally empty.
 */
static void
raid_cbuf_destructor(void *p, void *d)
{
}
/*
* NAMES: raid_run_queue
* DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
* PARAMETERS:
*/
/*ARGSUSED*/
static void
raid_run_queue(void *d)
{
/*
 * NOTE(review): the statement controlled by this if (and possibly
 * more of the body) is missing from this chunk; as shown the if
 * has no body, so this function is incomplete here.  Per the
 * block comment above it spawns a backend processing daemon.
 */
if (!(md_status & MD_GBL_DAEMONS_LIVE))
}
/*
* NAME: raid_build_pwslot
* DESCRIPTION: builds mr_pw_reserve for the column
* PARAMETERS: un is the pointer to the unit structure
* colindex is the column to create the structure for
*/
int
{
int i;
sb[i].sb_start_blk = 0;
sb[i].sb_last_blk = 0;
}
return (0);
}
/*
* NAME: raid_free_pw_reservation
* DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* int colindex - index of the column whose pre-write slot struct
* is to be destroyed.
*/
void
{
}
/*
* NAME: raid_cancel_pwslot
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
int broadcast = 0;
return;
broadcast++;
}
broadcast++;
}
continue;
broadcast++;
}
if (broadcast) {
return;
}
}
static void
{
int broadcast = 0;
while (cbuf) {
broadcast++;
cbuf_to_free = cbuf;
}
/*
* now that there is a free prewrite slot, check to see if there
* are any io operations waiting first wake up the raid_io_startup
* then signal the processes waiting in raid_write.
*/
if (broadcast) {
return;
}
}
static int
{
int i;
int flags;
/* start with the data column */
for (i = 0; i < pwcnt; i++) {
if (flags & SB_INVAL_PEND)
continue;
avail = i;
continue;
/* OVERLAP */
/*
* raid_invalidate_pwslot attempts to zero out prewrite entry
* transaction. however cs_frags accounting for this case is
* broken because raid_write_io resets cs_frags i.e. ignoring
* that it could have been been set to > 0 value by
* raid_invalidate_pwslot. While this can be fixed an
* additional problem is that we don't seem to handle
* correctly the case of getting a disk error for prewrite
* entry invalidation.
* It does not look like we really need
* to invalidate prewrite slots because raid_replay sorts
* prewrite id's in ascending order and during recovery the
* latest prewrite entry for the same block will be replay
* last. That's why i ifdef'd out the call to
* raid_invalidate_pwslot. --aguzovsk@east
*/
if (use == -1) {
use = i;
}
}
if (use == -1)
return (use);
}
static int
{
int i;
/*
* check to be sure there is a prewrite slot available
* if not just return.
*/
for (i = 0; i < un->un_totalcolumncnt; i++)
return (1);
return (0);
}
return (1);
return (1);
return (0);
}
static int
{
if (raid_check_pw(cs))
return (1);
}
return (0);
}
/*
* NAMES: raid_build_incore
* DESCRIPTION: RAID metadevice incore structure building routine
* PARAMETERS: void *p - pointer to a unit structure
* int snarfing - a flag to indicate snarfing is required
*/
int
{
int i;
int preserve_flags;
int iosize;
/* clear out bogus pointer incase we return(1) prior to alloc */
return (1);
}
return (0);
if (snarfing)
KM_SLEEP);
kmem_zalloc(sizeof (mr_column_ic_t) *
for (i = 0; i < un->un_totalcolumncnt; i++) {
column->un_devflags &=
if (raid_build_pw_reservation(un, i) != 0) {
/* could not build pwslot */
return (1);
}
if (snarfing) {
/*
* Comment out instead of remove so we have history
* In the pre-SVM releases stored devt is used so
* as long as there is one snarf is always happy
* even the component is powered off. This is not
* the case in current SVM implementation. NODEV64
* can be returned and in this case since we resolve
* the devt at 'open' time (first use of metadevice)
* we will allow snarf continue.
*
* if (dev == NODEV64)
* return (1);
*/
/*
* Setup un_orig_dev from device id info if the device
* is valid (not NODEV64).
*/
resync_cnt++;
error_cnt++;
(void) md_hot_spare_ifc(HS_MKDEV,
/*
* Same here
*
* if (hs == NODEV64)
* return (1);
*/
}
if (column->un_devstate &
(RCS_OKAY | RCS_LAST_ERRED)) {
column->un_pwstart =
/*
* if previous system was 4.0 set
* the direction flags
*/
if ((preserve_flags &
MD_RAID_REGEN_RESYNC)) == 0) {
if (column->un_alt_dev !=
else
/* CSTYLED */
}
}
} else { /* no hot spares */
}
}
}
column->un_alt_pwstart = 0;
column->un_alt_devstart = 0;
un->un_resync_line_index = 0;
un->un_resync_index = 0;
un->un_percent_done = 0;
}
}
if (resync_cnt && error_cnt) {
for (i = 0; i < un->un_totalcolumncnt; i++) {
/* hotspare has data */
continue;
/* hotspare does not have data */
}
}
}
/* place various information in the in-core data structures */
return (0);
}
/*
* NAMES: reset_raid
* DESCRIPTION: RAID metadevice reset routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* minor_t mnum - RAID metadevice minor number
* int removing - a flag to imply removing device name from
* MDDB database.
*/
void
{
int i, n = 0;
int hserr;
if (un->un_pbuffer) {
}
if (un->un_dbuffer) {
}
/* free all pre-write slots created during build incore */
for (i = 0; i < un->un_totalcolumncnt; i++)
/*
* Attempt release of its minor node
*/
if (!removing)
return;
KM_SLEEP);
recids = (mddb_recid_t *)
for (i = 0; i < column_cnt; i++) {
else
}
/*
* deparent any metadevices.
* NOTE: currently soft partitions are the only metadevices
* allowed in RAID metadevices.
*/
}
}
/* decrement the reference count of the old hsp */
recids[n] = 0;
/*
* Remove self from the namespace
*/
}
/* Remove the unit structure */
/* Remove the vtoc, if present */
if (vtoc_id)
}
/*
* NAMES: raid_error_parent
* DESCRIPTION: mark a parent structure in error
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
* int error - error value to set
* NOTE: (TBR) - this routine currently is not in use.
*/
static void
{
}
/*
* The following defines tell raid_free_parent
* RFP_RLS_LOCK release the unit reader lock when done.
* RFP_DECR_PWFRAGS decrement ps_pwfrags
* RFP_DECR_FRAGS decrement ps_frags
* RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep
*/
/*
* NAMES: raid_free_parent
* DESCRIPTION: free a parent structure
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
* int todo - indicates what needs to be done
*/
static void
{
if (todo & RFP_DECR_PWFRAGS) {
ps->ps_pwfrags--;
}
}
}
if (todo & RFP_DECR_FRAGS) {
}
return;
}
if (todo & RFP_RLS_LOCK)
if (panicstr) {
return;
}
(void) raid_hotspares();
}
/*
* NAMES: raid_free_child
* DESCRIPTION: free a parent structure
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
* int drop_locks - 0 for no locks held
* NOTE: (TBR) - this routine currently is not in use.
*/
static void
{
if (cs->cs_pw_inval_list)
if (drop_locks) {
} else {
}
while (cbuf) {
}
}
/*
* NAME: raid_regen_parity
*
* DESCRIPTION: This routine is used to regenerate the parity blocks
* for the entire raid device. It is called from
* both the regen thread and the IO path.
*
* On error the entire device is marked as in error by
* placing the erroring device in error and all other
* devices in last_errored.
*
* PARAMETERS: md_raidcs_t *cs
*/
void
{
int column;
int j;
/*
* This routine uses the data and parity buffers allocated to a
* write. In the case of a read the buffers are allocated and
* freed at the end.
*/
return;
getpbuffer(cs);
getdbuffer(cs);
}
if (column == parity_column)
continue;
goto bail;
pbuf++;
dbuf++;
}
}
goto bail;
}
return;
bail:
}
}
/*
* NAMES: raid_error_state
* DESCRIPTION: check unit and column states' impact on I/O error
* NOTE: the state now may not be the state when the
* I/O completed due to race conditions.
* PARAMETERS: mr_unit_t *un - pointer to raid unit structure
* md_raidcs_t *cs - pointer to child structure
* buf_t *bp - pointer to buffer structure
*/
static int
{
int column;
int i;
column = -1;
for (i = 0; i < un->un_totalcolumncnt; i++) {
column = i;
break;
}
column = i;
break;
}
}
/* in case a replace snuck in while waiting on unit writer lock */
if (column == -1) {
return (0);
}
}
return (EIO);
}
/*
* NAME: raid_mapin_buf
* DESCRIPTION: wait for the input buffer header to be mapped in
* PARAMETERS: md_raidps_t *ps
*/
static void
{
/*
* check to see if the buffer is mapped. If all is ok return the
* offset of the data. Since it is expensive to grab
* a mutex this is only done if the mapin is not complete.
* Once the mutex is acquired it is possible that the mapin was
* not done so recheck and if necessary do the mapin.
*/
return;
}
return;
}
/*
* get the new b_addr out of the parent since bp_mapin just changed it
*/
}
/*
* NAMES: raid_read_no_retry
* DESCRIPTION: I/O retry routine for a RAID metadevice read
* read failed attempting to regenerate the data,
* no retry possible, error occurred in raid_raidregenloop().
* PARAMETERS: mr_unit_t *un - pointer to raid unit structure
* md_raidcs_t *cs - pointer to child structure
*/
/*ARGSUSED*/
static void
{
/* decrement readfrags */
}
/*
* NAMES: raid_read_retry
* DESCRIPTION: I/O retry routine for a RAID metadevice read
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
static void
{
/* re-initialize the buf_t structure for raid_read() */
/* Initialize semaphores */
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
/*
* re-scheduling I/O with raid_read_io() is simpler. basically,
* raid_read_io() is invoked again with same child structure.
* (NOTE: we aren't supposed to do any error recovery when an I/O
* error occurred in raid_raidregenloop().
*/
}
/*
* NAMES: raid_rderr
* DESCRIPTION: I/O error handling routine for a RAID metadevice read
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
* LOCKS: must obtain unit writer lock while calling raid_error_state
* since a unit or column state transition may take place.
* must obtain unit reader lock to retry I/O.
*/
/*ARGSUSED*/
static void
{
int error = 0;
/* now attempt the appropriate retry routine */
}
/*
* NAMES: raid_read_error
* DESCRIPTION: I/O error handling routine for a RAID metadevice read
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
/*ARGSUSED*/
static void
{
/* now schedule processing for possible state change */
}
/*
* NAMES: getdbuffer
* DESCRIPTION: data buffer allocation for a child structure
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*
* NOTE: always get dbuffer before pbuffer
* and get both buffers before pwslot
* otherwise a deadlock could be introduced.
*/
static void
{
return;
}
}
/*
* NAMES: getpbuffer
* DESCRIPTION: parity buffer allocation for a child structure
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*
* NOTE: always get dbuffer before pbuffer
* and get both buffers before pwslot
* otherwise a deadlock could be introduced.
*/
static void
{
return;
}
}
static void
{
/*
* NOTE: always get dbuffer before pbuffer
* and get both buffers before pwslot
* otherwise a deadlock could be introduced.
*/
getdbuffer(cs);
getpbuffer(cs);
cbuf->cbuf_buffer =
}
/*
* NAMES: freebuffers
* DESCRIPTION: child structure buffer freeing routine
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
static void
{
/* free buffers used for full line write */
continue;
cbuf->cbuf_bcount = 0;
}
}
if (cs->cs_dbuffer) {
else
}
if (cs->cs_pbuffer) {
else
}
}
}
/*
* NAMES: raid_line_reader_lock, raid_line_writer_lock
* DESCRIPTION: RAID metadevice line reader and writer lock routines
* data column # and parity column #.
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
void
{
if (!panicstr)
break;
if (panicstr)
panic("md; raid line write lock held");
}
}
if (resync_thread) {
}
if (!panicstr)
}
int
{
break;
if (panicstr)
panic("md: line writer lock inaccessible");
goto no_lock_exit;
}
if (raid_alloc_pwslot(cs)) {
if (panicstr)
panic("md: no prewrite slots");
goto no_lock_exit;
}
return (0);
/* if this is already queued then do not requeue it */
return (1);
return (1);
}
static void
{
}
void
{
/*
* check to be sure there are no reader locks outstanding. If
* there are not then pass on the writer lock.
*/
while (waiting_list) {
break;
/*
* there was an IOs that overlaps this io so go onto
* the next io in the waiting list
*/
if (cs1) {
continue;
}
/*
* There are no IOs that overlap this, so remove it from
* the waiting queue, and start it
*/
if (raid_check_pw(waiting_list)) {
continue;
}
if (previous)
else
if (raid_line_writer_lock(waiting_list, 0))
panic("region locking corrupted");
waiting_list = next;
}
}
void
{
else
if (cs->cs_linlck_prev)
else
if (cs->cs_linlck_next)
if (un->un_linlck_flg)
un->un_linlck_flg = 0;
/*
* now that the lock is droped go ahead and see if there are any
* other writes that can be started up
*/
}
/*
* NAMES: raid_line, raid_pcolumn, raid_dcolumn
* DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
* data column # and parity column #.
* PARAMETERS: int segment - segment number
* mr_unit_t *un - pointer to an unit structure
* RETURNS: raid_line returns line #
* raid_dcolumn returns data column #
* raid_pcolumn returns parity column #
*/
static diskaddr_t
{
if (segment >= max_orig_segment) {
} else {
}
return (line);
}
{
if (segment >= max_orig_segment) {
} else {
}
return (column);
}
{
if (segment >= max_orig_segment) {
} else {
}
return (column);
}
/*
* Is called in raid_iosetup to probe each column to insure
* that all the columns are in 'okay' state and meet the
* 'full line' requirement. If any column is in error,
* we don't want to enable the 'full line' flag. Previously,
* we would do so and disable it only when a error is
* detected after the first 'full line' io which is too late
* and leads to the potential data corruption.
*/
static int
{
char *buf;
int i;
int err = 0;
for (i = 0; i < un->un_totalcolumncnt; i++) {
/*
* Open by device id
* If this device is hotspared
* use the hotspare key
*/
err = 1;
break;
}
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
err = 1;
break;
}
}
return (err);
}
/*
* NAME: raid_iosetup
* DESCRIPTION: RAID metadevice specific I/O set up routine which does
* all the necessary calculations to determine the location
* of the segment for the I/O.
* PARAMETERS: mr_unit_t *un - unit number of RAID metadevice
* diskaddr_t blkno - block number of the I/O attempt
* size_t blkcnt - block count for this I/O
* md_raidcs_t *cs - child structure for each segmented I/O
*
* NOTE: The following is an example of a raid disk layer out:
*
* Total Column = 5
* Original Column = 4
* Segment Per Column = 10
*
* Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6
* -------------------------------------------------------------
* line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40
* line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31
* line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32
* line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33
* line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34
* line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35
* line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36
* line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37
* line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38
* line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39
*/
static size_t
)
{
/* calculate the segment# and offset for the block */
(segoff == 0) &&
(raid_check_cols(un) == 0)) {
int i, j;
for (i = 0; i < un->un_totalcolumncnt; i++) {
j = cs->cs_dcolumn + i;
j = j % un->un_totalcolumncnt;
continue;
cbuf->cbuf_column = j;
}
return (leftover);
}
else
leftover = 0;
}
/* calculate the line# and column# for the segment */
return (leftover);
}
/*
* NAME: raid_done
* DESCRIPTION: RAID metadevice I/O done interrupt routine
* PARAMETERS: struct buf *bp - pointer to a buffer structure
*/
static void
{
}
if (frags != 0) {
return;
}
if (flags & MD_RCS_ERROR) {
if (cs->cs_error_call) {
}
return;
}
if (flags & MD_RCS_ISCALL) {
return;
}
}
/*
* the flag RIO_EXTRA is used when dealing with a column in the process
* of being resynced. During the resync, writes may have to take place
* on both the original component and a hotspare component.
*/
/*
* NAME: raidio
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
int column;
int flag;
void *private;
int iosize;
else
} else {
else
}
if (flags & RIO_COLMASK)
/* check if the hotspared device will be used */
} else {
}
/* if not writing to log skip log header */
} else {
} else { /* not DATA -> PARITY */
}
}
} else {
}
}
/*
* NAME: genstandardparity
* DESCRIPTION: This routine
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
/* Word aligned */
while (wordcnt--) {
++pbuf;
++dbuf;
}
} else {
union {
} cb;
while (wordcnt--) {
++pbuf;
++dbuf;
}
}
}
static void
{
/* Word aligned */
while (wordcnt--) {
++pbuf;
++dbuf;
}
} else {
union {
} cb;
while (wordcnt--) {
++pbuf;
++dbuf;
}
}
dsum = 0;
/* Word aligned */
/*
* Only calculate psum when working on the last
* data buffer.
*/
psum = 0;
while (wordcnt--) {
++dbuf;
++pbuf;
}
} else {
while (wordcnt--) {
++dbuf;
++pbuf;
}
}
} else {
union {
} cb;
/*
* Only calculate psum when working on the last
* data buffer.
*/
psum = 0;
while (wordcnt--) {
++dbuf;
++pbuf;
}
} else {
while (wordcnt--) {
++dbuf;
++pbuf;
}
}
}
/*
* fill in buffer for write to prewrite area
*/
if (nv_available && nv_prewrite)
}
}
/*
* NAME: raid_readregenloop
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
/*
* XOR the parity with data bytes, must skip the
*/
while (wordcnt--)
/* bump up the loop count */
/* skip the errored component */
return;
}
/* reaching the end of the loop */
/* decrement readfrags */
}
/*
* NAME: raid_read_io
* DESCRIPTION: RAID metadevice read I/O routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
static void
{
int flag;
void *private;
/*
* The component to be read is good, simply set up bp structure
* and call low level md routine doing the read.
*/
return;
}
/*
* The component to be read is bad, have to go through
* raid specific method to read data from other members.
*/
/*
* NOTE: always get dbuffer before pbuffer
* and get both buffers before pwslot
* otherwise a deadlock could be introduced.
*/
getdbuffer(cs);
getpbuffer(cs);
/* zero out data buffer for use as a data sink */
/* use parity buffer to read other columns */
}
/*
* NAME: raid_read
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
static int
{
int error = 0;
raid_line_reader_lock(cs, 0);
/* make sure the read doesn't go beyond the end of the column */
}
if (error)
goto rerror;
}
return (0);
/* decrement readfrags */
return (0);
}
/*
* NAME: raid_write_err_retry
* DESCRIPTION: RAID metadevice write retry routine
* write was for parity or data only;
* complete write with error, no recovery possible
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
/*ARGSUSED*/
static void
{
/* decrement pwfrags if needed, and frags */
}
/*
* NAME: raid_write_err_retry
* DESCRIPTION: RAID metadevice write retry routine
* write is too far along to retry and parent
* has already been signaled with iodone.
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
/*ARGSUSED*/
static void
{
/* decrement pwfrags if needed, and frags */
}
/*
* NAME: raid_write_retry
* DESCRIPTION: RAID metadevice write retry routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
static void
{
/* re-initialize the buf_t structure for raid_write() */
/* Initialize semaphores */
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
SEMA_DEFAULT, NULL);
/*
* If we have already done'ed the i/o but have done prewrite
* on this child, then reset PWDONE flag and bump pwfrags before
* restarting i/o.
* If pwfrags is zero, we have already 'iodone'd the i/o so
* leave things alone. We don't want to re-'done' it.
*/
ps->ps_pwfrags++;
}
}
/*
* NAME: raid_wrerr
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
* LOCKS: must obtain unit writer lock while calling raid_error_state
* since a unit or column state transition may take place.
* must obtain unit reader lock to retry I/O.
*/
static void
{
/* now attempt the appropriate retry routine */
}
/*
* NAMES: raid_write_error
* DESCRIPTION: I/O error handling routine for a RAID metadevice write
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
/*ARGSUSED*/
static void
{
/*
* locate each buf that is in error on this io and then
* output an error message
*/
NULL, 0));
/* now schedule processing for possible state change */
}
/*
* NAME: raid_write_ponly
* DESCRIPTION: RAID metadevice write routine
* in the case where only the parity column can be written
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
/* decrement pwfrags if needed, but not frags */
}
}
/*
* NAME: raid_write_ploop
* DESCRIPTION: RAID metadevice write routine, constructs parity from
* data in other columns.
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
while (wordcnt--)
/*
* build parity from scratch using new data,
* skip reading the data and parity columns.
*/
return;
}
/* construct checksum for parity buffer */
while (wordcnt--) {
pbuf++;
}
}
}
/*
* NAME: raid_write_donly
* DESCRIPTION: RAID metadevice write routine
* Completed writing data to prewrite entry
* in the case where only the data column can be written
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
/* WARNING: don't release unit reader lock here... */
/* decrement pwfrags if needed, but not frags */
}
}
/*
* NAME: raid_write_got_old
* DESCRIPTION: RAID metadevice write routine
* completed read of old data and old parity
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure
*/
static void
{
}
}
}
/*
* NAME: raid_write_io
* DESCRIPTION: RAID metadevice write I/O routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
/*ARGSUSED*/
static void
{
int pcheck;
int dcheck;
RCS_INIT) == 0);
RCS_INIT) == 0);
return;
}
RCS_LAST_ERRED) ||
| RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
return;
}
/*
* handle case of only having data drive
*/
while (wordcnt--) {
dbuf++;
ubuf++;
}
}
return;
}
/*
* handle case of only having parity drive
* build parity from scratch using new data,
* skip reading the data and parity columns.
*/
/* copy new data in to begin building parity */
return;
}
/*
* handle normal cases
* read old data and old parity
*/
}
static void
{
} else {
}
}
/*
* NAME: raid_write
* DESCRIPTION: RAID metadevice write routine
* PARAMETERS: mr_unit_t *un - pointer to a unit structure
* md_raidcs_t *cs - pointer to a child structure
*/
/*ARGSUSED*/
static int
{
int error = 0;
/* make sure the write doesn't go beyond the column */
if (error)
goto werror;
/*
* this is an advisory loop that keeps the waiting lists short
* to reduce cpu time. Since there is a race introduced by not
* acquiring all the correct mutexes, use a cv_timedwait to be
* sure the write always will wake up and start.
*/
while (raid_check_pw(cs)) {
}
return (0);
}
return (0);
/* acquire unit reader lock since raid_free_child always drops it */
raid_free_child(cs, 0);
/* decrement both pwfrags and frags */
return (0);
}
/*
* NAMES: raid_stage
* DESCRIPTION: post-processing routine for a RAID metadevice
* PARAMETERS: md_raidcs_t *cs - pointer to child structure
*/
static void
{
void *private;
int flag;
case RAID_READ_DONE:
/* decrement readfrags */
return;
case RAID_WRITE_DONE:
case RAID_WRITE_PONLY_DONE:
case RAID_WRITE_DONLY_DONE:
/*
*/
/* decrement frags but not pwfrags */
return;
case RAID_PREWRITE_DONE:
/*
* completed writing data and parity to prewrite entries
*/
/*
* WARNING: don't release unit reader lock here..
* decrement pwfrags but not frags
*/
}
}
if (cs->cs_pw_inval_list) {
}
return;
case RAID_LINE_PWDONE:
/*
* fill in buffer for write to prewrite area
*/
}
if (cs->cs_pw_inval_list) {
}
return;
default:
ASSERT(0);
break;
}
}
/*
* NAME: md_raid_strategy
* DESCRIPTION: RAID metadevice I/O operations entry point.
* PARAMETERS: buf_t *pb - pointer to a user I/O buffer
* int flag - metadevice specific flag
* void *private - carry over flag ??
*
*/
void
{
int doing_writes;
int err;
int colcnt;
if ((flag & MD_NOBLOCK) == 0) {
if (md_inc_iocount(setno) != 0) {
return;
}
} else {
}
(void *) md_unit_readerlock(ui);
if (!(flag & MD_STR_NOTTOP)) {
if (err != 0) {
return;
}
}
/* allocate a parent structure for the user I/O */
/*
* Save essential information from the original buffhdr
* in the md_save structure.
*/
doing_writes = 1;
} else {
doing_writes = 0;
}
addr = 0;
offset = 0;
do {
} else {
}
/* for each cs bump up the ps_pwfrags and ps_frags fields */
if (count) {
ps->ps_pwfrags++;
if (doing_writes)
else
}
} while (count);
if (doing_writes) {
} else
drv_usecwait(10);
}
}
}
/*
* NAMES: raid_snarf
* DESCRIPTION: RAID metadevice SNARF entry point
* PARAMETERS: md_snarfcmd_t cmd,
* set_t setno
* RETURNS:
*/
static int
{
int gotsomething;
int all_raid_gotten;
if (cmd == MD_SNARF_CLEANUP)
return (0);
all_raid_gotten = 1;
gotsomething = 0;
continue;
}
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
/*
* This means, we have an old and small record
* and this record hasn't already been
* converted. Before we create an incore
* metadevice from this we have to convert it to
* a big record.
*/
small_un =
newreqsize = sizeof (mr_unit_t) +
KM_SLEEP);
} else {
/*
* Record has already been converted. Just
* get its address.
*/
}
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
/* Big device */
break;
}
/*
* Create minor device node for snarfed entry.
*/
continue;
}
all_raid_gotten = 0;
gotsomething = 1;
}
}
if (!all_raid_gotten) {
return (gotsomething);
}
return (0);
}
/*
* NAMES: raid_halt
* DESCRIPTION: RAID metadevice HALT entry point. For MD_HALT_CHECK it
* reports whether any RAID unit is still open; for
* MD_HALT_DOIT it tears units down (loop bodies lost in
* this extraction — confirm against full file).
* PARAMETERS: md_haltcmd_t cmd -
* set_t setno -
* RETURNS: 0 on success/no-op, 1 when halt must be refused (a unit is
* open, or the command is unrecognized).
*/
static int
{
set_t i;
/* These sub-commands require no action here. */
if (cmd == MD_HALT_CLOSE)
return (0);
if (cmd == MD_HALT_OPEN)
return (0);
if (cmd == MD_HALT_UNLOAD)
return (0);
/* CHECK: refuse the halt if any unit in the set is open. */
if (cmd == MD_HALT_CHECK) {
for (i = 0; i < md_nunits; i++) {
continue;
continue;
if (md_unit_isopen(ui))
return (1);
}
return (0);
}
/* Anything other than DOIT at this point is an error. */
if (cmd != MD_HALT_DOIT)
return (1);
/* DOIT: iterate all units (reset logic missing from extraction). */
for (i = 0; i < md_nunits; i++) {
continue;
continue;
}
return (0);
}
/*
* NAMES: raid_close_all_devs
* DESCRIPTION: Close all the devices of the unit.
* PARAMETERS: mr_unit_t *un - pointer to unit structure
* RETURNS: void
*
* NOTE(review): the per-column close condition/call is truncated here;
* only the prewrite-start argument line survives.
*/
void
{
int i;
/* Walk every column of the unit and close its underlying device. */
for (i = 0; i < un->un_totalcolumncnt; i++) {
device->un_pwstart, i);
}
}
}
/*
* NAMES: raid_open_all_devs
* DESCRIPTION: Open all the components (columns) of the device unit.
* PARAMETERS: mr_unit_t *un - pointer to unit structure
* RETURNS: nonzero when more than one column failed to open (the unit
* cannot run); 0 otherwise.
*
* NOTE(review): open calls, state tests and the commit/set-error calls
* are missing from this extraction; comments are hedged.
*/
static int
{
int i;
int not_opened = 0;	/* count of columns that failed to open */
int commit = 0;		/* nonzero when unit state must be committed */
/* First pass: attempt to open every column (by dev, then by devid). */
for (i = 0; i < un->un_totalcolumncnt; i++) {
not_opened++;
continue;
}
continue;
/*
* Open by device id
*/
}
not_opened++;
continue;
}
}
/* if open errors and errored devices are 1 then device can run */
if (not_opened > 1) {
"md: %s failed to open. open error on %s\n",
return (not_opened > 1);
}
/* Second pass: reconcile column error states with the open results. */
for (i = 0; i < un->un_totalcolumncnt; i++) {
/*
* At this point in time there is a possibility
* that errors were the result of a controller
* failure with more than a single column on it
* so clear out last errored columns and let errors
* re-occur if necessary.
*/
commit++;
}
continue;
}
/* remember the (single) column that could not be opened */
col = i;
}
if (col != -1) {
commit++;
}
if (commit)
if (col != -1) {
}
}
return (0);
}
/*
* NAMES: raid_internal_open
* DESCRIPTION: Do the actual RAID open
* PARAMETERS: minor_t mnum - minor number of the RAID device
* int flag -
* int otyp -
* int md_oflags - RAID open flags
* RETURNS: 0 if successful, nonzero otherwise
*
* NOTE(review): the unit lookup, state checks and replay invocation are
* missing from this extraction; comments are hedged.
*/
int
{
int err = 0;
int replay_error = 0;
/*
* this MUST be checked before md_unit_isopen is checked.
* raid_init_columns sets md_unit_isopen to block reset, halt.
*/
!(md_oflags & MD_OFLG_ISINIT)) {
return (EAGAIN);
}
goto out;
}
goto out;
}
} else {
/*
* if this unit contains more than two errored components
* should return error and close all opened devices
*/
return (ENXIO);
}
}
/* A read-only replay result is still a successful open. */
if ((replay_error == RAID_RPLY_READONLY) &&
return (0);
}
/* allocate hotspare if possible */
(void) raid_hotspares();
out:
return (err);
}
/*
* NAMES: raid_open
* DESCRIPTION: RAID metadevice OPEN entry point (cb_ops open(9E));
* apparently a thin wrapper around raid_internal_open —
* the wrapper's locking and the actual call are missing
* from this extraction.
* PARAMETERS: dev_t dev -
* int flag -
* int otyp -
* cred_t * cred_p -
* int md_oflags -
* RETURNS: 0 on success, else the error from the internal open.
*/
/*ARGSUSED1*/
static int
{
int error = 0;
/* propagate failure from the internal open — TODO confirm */
return (error);
}
return (0);
}
/*
* NAMES: raid_internal_close
* DESCRIPTION: RAID metadevice CLOSE actual implementation
* PARAMETERS: minor_t - minor number of the RAID device
* int otyp -
* int init_pw -
* int md_cflags - RAID close flags
* RETURNS: 0 if successful, nonzero otherwise
*
* NOTE(review): the lock acquisition, open-count bookkeeping and the
* device-close call are missing from this extraction.
*/
/*ARGSUSED*/
int
{
int err = 0;
/* single thread */
/* count closed */
goto out;
/* close devices, if necessary */
}
/* unlock, return success */
out:
return (err);
}
/*
* NAMES: raid_close
* DESCRIPTION: RAID metadevice close entry point (cb_ops close(9E));
* apparently delegates to raid_internal_close — the call
* itself is missing from this extraction.
* PARAMETERS: dev_t dev -
* int flag -
* int otyp -
* cred_t * cred_p -
* int md_oflags -
* RETURNS: result of the internal close.
*/
/*ARGSUSED1*/
static int
{
int retval;
return (retval);
}
/*
* raid_probe_close_all_devs
*
* Close every column device that was opened for a probe pass.
* NOTE(review): the per-column close call is missing from this
* extraction — only the loop skeleton survives.
*/
void
{
int i;
for (i = 0; i < un->un_totalcolumncnt; i++) {
}
}
}
/*
* Raid_probe_dev:
*
* Probe the unit's column devices and update their error states.
*
* On entry the unit writerlock is held
*
* NOTE(review): open attempts, state tests and the commit calls are
* missing from this extraction; comments are hedged.
*/
static int
{
int i;
int not_opened = 0;	/* columns that failed to open during probe */
int commit = 0;		/* nonzero when unit state must be committed */
int md_devopen = 0;	/* nonzero when the metadevice itself is open */
if (md_unit_isopen(ui))
md_devopen++;
/*
* If the state has been set to LAST_ERRED because
* of an error when the raid device was open at some
* point in the past, don't probe. We really don't want
* to reset the state in this case.
*/
return (0);
/* First pass: try to open each column (by dev, then by devid). */
for (i = 0; i < un->un_totalcolumncnt; i++) {
not_opened++;
continue;
}
/*
* Currently the flags passed are not needed since
* there cannot be an underlying metadevice. However
* they are kept here for consistency.
*
* Open by device id
*/
not_opened++;
continue;
}
}
/*
* The code below is careful on setting the LAST_ERRED state.
*
* If open errors and exactly one device has failed we can run.
* If more then one device fails we have to figure out when to set
* LAST_ERRED state. The rationale is to avoid unnecessary resyncs
* since they are painful and time consuming.
*
*
* 1. Metadevice has NOT been opened: In this case, the behavior
* mimics the open semantics. ie. Only the first failed device
* is ERRED and LAST_ERRED is not set.
*
* 2. (heading apparently lost in extraction) the close semantics are
* followed. The first failed device is ERRED and on the next
* failed device LAST_ERRED is set.
*/
"md: %s failed to open. open error on %s\n",
return (not_opened > 1);
}
/* Case 1: metadevice not open — mirror the open-time error policy. */
if (!md_devopen) {
for (i = 0; i < un->un_totalcolumncnt; i++) {
/*
* At this point in time there is a
* possibility that errors were the
* result of a controller failure with
* more than a single column on it so
* clear out last errored columns and
* let errors re-occur if necessary.
*/
commit++;
}
continue;
}
/*
* note if multiple devices are failing then only
* the last one is marked as error
*/
col = i;
}
if (col != -1) {
commit++;
}
} else {
/* Case 2: metadevice open — allow LAST_ERRED to be set. */
for (i = 0; i < un->un_totalcolumncnt; i++) {
/* if we have LAST_ERRED go ahead and commit. */
break;
/*
* could not open the component
*/
col = i;
commit++;
}
}
}
if (commit)
if (col != -1) {
}
}
return (0);
}
/*
* Import-set entry point for RAID units (registered as "import set"
* in the ops table below). Walks the set's MDDB records, fixes up
* device IDs per column for both small (32-bit) and big (64-bit)
* record revisions, and rewrites unit/namespace entries with the
* imported set number.
*
* NOTE(review): the parameter list and most statements are missing
* from this extraction — comments are hedged.
* RETURNS: nonzero if at least one record was converted.
*/
static int
)
{
int i, gotsomething;
gotsomething = 0;
continue;
/* Dispatch on the on-disk record revision. */
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
/*
* Small device
*/
for (i = 0; i < un32->un_totalcolumncnt; i++) {
goto out;
}
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
/* Big device: same per-column fixup for the 64-bit layout. */
for (i = 0; i < un64->un_totalcolumncnt; i++) {
goto out;
}
break;
}
/*
* If this is a top level and a friendly name metadevice,
* update its minor in the namespace.
*/
if ((*parent_id == MD_NO_PARENT) &&
goto out;
}
/*
* Update unit with the imported setno
*/
if (*hsp_id != -1)
if (*parent_id != MD_NO_PARENT)
gotsomething = 1;
}
out:
return (gotsomething);
}
{raid_hotspares, "poke hotspares" },
{NULL, 0 }
};
raid_open, /* open */
raid_close, /* close */
md_raid_strategy, /* strategy */
NULL, /* print */
NULL, /* dump */
NULL, /* read */
NULL, /* write */
md_raid_ioctl, /* ioctl, */
raid_snarf, /* raid_snarf */
raid_halt, /* raid_halt */
NULL, /* aread */
NULL, /* awrite */
raid_imp_set, /* import set */
};
/*
* Module initialization (name lost in extraction — presumably the
* md_ops init entry). Appears to set a default write-wait value and
* create kmem caches for parent (md_raidps_t) and child-buffer
* (md_raidcbuf_t) structures; the kmem_cache_create calls themselves
* are truncated here — TODO confirm against the full file.
*/
static void
{
/* default to a second */
if (md_wr_wait == 0)
sizeof (md_raidps_t), 0, raid_parent_constructor,
sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
}
/*
* Module teardown counterpart to the init routine above (name lost in
* extraction). Body is empty as extracted — presumably the kmem cache
* destroy calls were dropped; verify against the full file.
*/
static void
{
}
/* define the module linkage */