vdev_raidz.c revision 44cd46cadd9aab751dae6a4023c1cb5bf316d274
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/zio_checksum.h>
/*
* Virtual device vector for RAID-Z.
*/
/*
* We currently allow up to two-way replication (i.e. single-fault
* reconstruction) models in RAID-Z vdevs. The blocks in such vdevs
* must all be multiples of two times the leaf vdev blocksize.
*
* NOTE(review): VDEV_RAIDZ_ALIGN presumably encodes that factor of two;
* its uses are not visible in this excerpt -- confirm against callers.
*/
#define VDEV_RAIDZ_ALIGN 2ULL
/*
* State kept for a single RAID-Z column; the routines below reach it
* through a pointer named 'rc'.
*/
typedef struct raidz_col {
/* Untyped buffer pointer; presumably this column's data -- confirm. */
void *rc_data;
/* Presumably the error recorded for this column's I/O -- confirm. */
int rc_error;
/* Presumably nonzero once I/O on this column was attempted -- confirm. */
short rc_tried;
/*
* Flag. One of the routines below clears it, and a column that has it
* set is not counted in zio->io_numerrors.
*/
short rc_skipped;
} raidz_col_t;
/*
* RAID-Z map state; the routines below reach it through a pointer
* named 'rm'.
*/
typedef struct raidz_map {
/*
* Column index of a child found unusable: set to 'c' when
* vdev_is_dead(cvd) is true, and also under a second condition
* that is not visible in this excerpt.
*/
int rm_missing_child;
/*
* Exclusive upper bound of the "for (c = 0; c < rm->rm_firstdatacol;
* c++)" loops below. The surrounding comments suggest the columns
* below this index hold parity -- confirm.
*/
int rm_firstdatacol;
} raidz_map_t;
static raidz_map_t *
{
int firstdatacol;
q = s / (dcols - 1);
r = s - q * (dcols - 1);
bc = r + !!r;
firstdatacol = 1;
for (c = 0; c < acols; c++) {
col = f + c;
coff = o;
}
}
for (c = 0; c < rm->rm_firstdatacol; c++)
for (c = c + 1; c < acols; c++)
/*
* To prevent hot parity disks, switch the parity and data
* columns every 1MB.
*/
}
return (rm);
}
static void
{
int c;
for (c = 0; c < rm->rm_firstdatacol; c++)
}
static void
{
int i, c;
if (c == x)
continue;
if (c == !x) {
/*
* The initial copy happens at either c == 0 or c == 1.
* Both of these columns are 'big' columns, so we'll
* definitely initialize all of column x.
*/
for (i = 0; i < count; i++)
} else {
for (i = 0; i < count; i++)
}
}
}
static int
{
int c, error;
int lasterror = 0;
int numerrors = 0;
/*
* XXX -- minimum children should be raid-type-specific
*/
return (EINVAL);
}
for (c = 0; c < vd->vdev_children; c++) {
numerrors++;
continue;
}
}
if (numerrors > 1) {
return (lasterror);
}
return (0);
}
static void
{
int c;
for (c = 0; c < vd->vdev_children; c++)
}
static uint64_t
{
return (asize);
}
static void
{
rc->rc_skipped = 0;
}
static void
{
}
static void
{
int c;
/*
* Generate RAID parity in virtual column 0.
*/
vdev_raidz_reconstruct(rm, 0);
}
return;
}
if (vdev_is_dead(cvd)) {
rm->rm_missing_child = c;
continue;
}
rm->rm_missing_child = c;
continue;
}
}
}
}
/*
* Report a checksum error for a child of a RAID-Z device.
*/
static void
{
}
}
static void
{
int unexpected_errors = 0;
int c;
zio->io_numerrors = 0;
/*
* We preserve any EIOs because those may be worth retrying;
* whereas ECKSUM and ENXIO are more likely to be persistent.
*/
if (!rc->rc_skipped)
zio->io_numerrors++;
}
}
/*
* If this is not a failfast write, and we were able to
* write enough columns to reconstruct the data, good enough.
*/
/* XXPOLICY */
return;
}
/*
* If there were no I/O errors, and the data checksums correctly,
* the read is complete.
*/
/* XXPOLICY */
ASSERT(unexpected_errors == 0);
/*
* We know the data's good. If we read the parity,
* verify that it's good as well. If not, fix it.
*/
for (c = 0; c < rm->rm_firstdatacol; c++) {
void *orig;
continue;
vdev_raidz_reconstruct(rm, c);
}
}
goto done;
}
/*
* If there was exactly one I/O error, it's the one we expected,
* and the reconstructed data checksums, the read is complete.
* This happens when one child is offline and vdev_fault_assess()
* knows it, or when one child has stale data and the DTL knows it.
*/
ASSERT(unexpected_errors == 0);
vdev_raidz_reconstruct(rm, c);
if (zio_checksum_error(zio) == 0) {
goto done;
}
}
/*
* This isn't a typical error -- either we got a read error or
* more than one child claimed a problem. Read every block we
* haven't already so we can try combinatorial reconstruction.
*/
unexpected_errors = 1;
break;
continue;
}
return;
}
/*
* If there were more errors than parity disks, give up.
*/
goto done;
}
/*
* The number of I/O errors is correctable. Correct them here.
*/
vdev_raidz_reconstruct(rm, c);
if (zio_checksum_error(zio) == 0)
else
goto done;
}
}
/*
* There were no I/O errors, but the data doesn't checksum.
* Try all permutations to see if we can find one that does.
*/
void *orig;
vdev_raidz_reconstruct(rm, c);
if (zio_checksum_error(zio) == 0) {
/*
* If this child didn't know that it returned bad data,
* inform it.
*/
goto done;
}
}
/*
* All combinations failed to checksum. Generate checksum ereports for
* every one.
*/
}
}
done:
/*
* Use the good data we have in hand to repair damaged children.
*
* The repair I/Os are issued in a way that ensures
* that vdev_raidz_map_free(zio) will be invoked after all
* repairs complete, but before we advance to the next stage.
*/
continue;
dprintf("%s resilvered %s @ 0x%llx error %d\n",
}
return;
}
}
static void
{
if (faulted > 1)
else
}
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};