zfs_fm.c revision 22fe2c8844be88ebae6478ca1b0b92c8ec2aef54
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_checksum.h>
#include <sys/sysevent.h>
/*
* This general routine is responsible for generating all the different ZFS
* ereports. The payload is dependent on the class, and which arguments are
* supplied to the function:
*
* EREPORT POOL VDEV IO
* block X X X
* data X X
* device X X
* pool X
*
* If we are in a loading state, all errors are chained together by the same
* SPA-wide ENA (Error Numeric Association).
*
* For isolated I/O requests, we get the ENA from the zio_t. The propagation
* gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
* to chain together all ereports associated with a logical piece of data. For
* layered diagram:
*
* +---------------+
* | Aggregate I/O | No associated logical data or device
* +---------------+
* |
* V
* +---------------+ Reads associated with a piece of logical data.
* | Read I/O | This includes reads on behalf of RAID-Z,
* +---------------+ mirrors, gang blocks, retries, etc.
* |
* V
* +---------------+ Reads associated with a particular device, but
* | Physical I/O | no logical data. Issued as part of vdev caching
* +---------------+ and I/O aggregation.
*
* Note that 'physical I/O' here is not the same terminology as used in the rest
* of ZIO. Typically, 'physical I/O' simply means that there is no attached
* blockpointer. But I/O with no associated block pointer can still be related
* to a logical piece of data (i.e. RAID-Z requests).
*
* Purely physical I/O always have unique ENAs. They are not related to a
* particular piece of logical data, and therefore cannot be chained together.
* We still generate an ereport, but the DE doesn't correlate it with any
* logical piece of data. When such an I/O fails, the delegated I/O requests
* will issue a retry, which will trigger the 'real' ereport with the correct
* ENA.
*
* We keep track of the ENA for a ZIO chain through the 'io_logical' member.
* then inherit this pointer, so that when it is first set subsequent failures
* will use the same ENA. For vdev cache fill and queue aggregation I/O,
* this pointer is set to NULL, and no ereport will be generated (since it
* doesn't actually correspond to any particular device or piece of data,
* and the caller will always retry without caching or queueing anyway).
*
* For checksum errors, we want to include more information about the actual
* error which occurs. Accordingly, we build an ereport when the error is
* noticed, but instead of sending it in immediately, we hang it off of the
* io_cksum_report field of the logical IO. When the logical IO completes
* (successfully or not), zfs_ereport_finish_checksum() is called with the
* good and bad versions of the buffer (if available), and we annotate the
* ereport with information about the differences.
*/
#ifdef _KERNEL
static void
{
char class[64];
/*
* If we are doing a spa_tryimport(), ignore errors.
*/
return;
/*
* If we are in the middle of opening a pool, and the previous attempt
* failed, don't bother logging any new ereports - we're just going to
* get the same diagnosis anyway.
*/
return;
/*
* If this is not a read or write zio, ignore the error. This
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
*/
return;
/*
* expected result.
*/
return;
/*
* If this I/O is not a retry I/O, don't post an ereport.
* Otherwise, we risk making bad diagnoses based on B_FAILFAST
*/
return;
/*
* If the vdev has already been marked as failing due
* to a failed probe, then ignore any subsequent I/O
* errors, as the DE will automatically fault the vdev
* on the first such failure. This also catches cases
* where vdev_remove_wanted is set and the device has
* not yet been asynchronously placed into the REMOVED
* state.
*/
return;
/*
* Ignore checksum errors for reads from DTL regions of
* leaf vdevs.
*/
return;
}
}
/*
* For probe failure, we want to avoid posting ereports if we've
* already removed the device in the meantime.
*/
return;
return;
return;
}
/*
* Serialize ereport generation
*/
/*
* Determine the ENA to use for this event. If we are in a loading
* state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
* a root zio-wide ENA. Otherwise, simply use a unique ENA.
*/
} else {
}
/*
* Construct the full class, detector, and other standard FMA fields.
*/
/*
* Construct the per-ereport payload, depending on which parameters are
* passed in.
*/
/*
* Generic payload members common to all ereports.
*/
NULL);
}
NULL);
if (pvd->vdev_devid)
}
}
/*
*/
/*
* If the 'size' parameter is non-zero, it indicates this is a
* RAID-Z or other I/O where the physical offset and length are
* provided for us, instead of within the zio_t.
*/
if (size)
else
}
/*
*/
/*
* If we have a vdev but no zio, this is a device fault, and the
* 'stateoroffset' parameter indicates the previous state of the
* vdev.
*/
}
*ereport_out = ereport;
*detector_out = detector;
}
/* if it's <= 128 bytes, save the corruption directly */
#define MAX_RANGES 16
typedef struct zfs_ecksum_info {
/* histograms of set and cleared bits by bit number in a 64-bit word */
/* inline arrays of bits set and cleared. */
/*
* for each range, the number of bits set and cleared. The Hamming
* distance between the good and bad buffers is the sum of them all.
*/
struct zei_ranges {
} zei_ranges[MAX_RANGES];
static void
{
size_t i;
/* We store the bits in big-endian (largest-first) order */
for (i = 0; i < 64; i++) {
if (value & (1ull << i)) {
hist[63 - i]++;
++bits;
}
}
/* update the count of bits changed */
}
/*
* We've now filled up the range array, and need to increase "mingap" and
* shrink the range list accordingly. zei_mingap is always the smallest
* distance between array entries, so we set the new_allowed_gap to be
* one greater than that. We then go through the list, joining together
* any ranges which are closer than the new_allowed_gap.
*
* By construction, there will be at least one. We also update zei_mingap
* to the new smallest gap, to prepare for our next invocation.
*/
static void
{
idx++;
if (gap < new_allowed_gap) {
continue;
}
break;
}
output++;
}
}
static void
{
if (count >= MAX_RANGES) {
}
if (count == 0) {
} else {
return;
}
}
eip->zei_range_count++;
}
static size_t
{
return (result);
}
static zfs_ecksum_info_t *
{
uint64_t allcleared = 0;
int no_inline = 0;
/* don't do any annotation for injected checksum errors */
return (eip);
NULL);
if (info->zbc_byteswapped) {
NULL);
}
}
return (eip);
/* build up the range list by comparing the two buffers. */
if (start == -1)
continue;
start = -1;
} else {
if (start != -1)
continue;
}
}
if (start != -1)
/* See if it will fit in our inline buffers */
if (inline_size > ZFM_MAX_INLINE)
no_inline = 1;
/*
* If there is no change and we want to drop if the buffers are
* identical, do so.
*/
if (inline_size == 0 && drop_if_identical) {
return (NULL);
}
/*
* Now walk through the ranges, filling in the details of the
* differences. Also convert our uint64_t-array offsets to byte
* offsets.
*/
// bits set in bad, but not in good
// bits set in good, but not in bad
allcleared |= cleared;
if (!no_inline) {
offset++;
}
}
/* convert to byte offsets */
}
inline_size *= sizeof (uint64_t);
/* fill in ereport */
NULL);
if (!no_inline) {
NULL);
} else {
NULL);
}
return (eip);
}
#endif
void
{
#ifdef _KERNEL
return;
#endif
}
void
{
else
/* copy the checksum failure information if it was provided */
}
#ifdef _KERNEL
return;
}
#endif
}
void
{
#ifdef _KERNEL
#endif
}
void
{
#ifdef _KERNEL
}
#endif
}
void
{
#ifdef _KERNEL
#endif
}
void
{
#ifdef _KERNEL
return;
B_FALSE);
#endif
}
static void
{
#ifdef _KERNEL
char class[64];
return;
return;
if (vd)
#endif
}
/*
* The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
* has been removed from the system. This will cause the DE to ignore any
* recent I/O errors, inferring that they are due to the asynchronous device
* removal.
*/
void
{
}
/*
* The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
* has the 'autoreplace' property set, and therefore any broken vdevs will be
* handled by higher level logic, and no vdev fault should be generated.
*/
void
{
}