zfs_de.c revision 11027bc778dc4f44eabf0c8bc54260ea890b0a15
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <assert.h>
#include <stddef.h>
#include <strings.h>
#include <libuutil.h>
#include <libzfs.h>
/*
* Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
* #define reserves enough space for two 64-bit hex values plus the length of
* the longest string.
*/
/*
* On-disk case structure. This must maintain backwards compatibility with
* previous versions of the DE. By default, any members appended to the end
* will be filled with zeros if they don't exist in a previous version.
*/
typedef struct zfs_case_data {
int zc_has_timer; /* defunct */
int zc_pool_state;
char zc_serd_checksum[MAX_SERDLEN];
char zc_serd_io[MAX_SERDLEN];
int zc_has_remove_timer;
/*
* Time-of-day
*/
typedef struct er_timeval {
} er_timeval_t;
/*
* In-core case structure.
*/
typedef struct zfs_case {
/*
* NOTE(review): presumably the FRU string of the vdev this case concerns
* (see the "associated vdev FRU" handling in the receive path); ownership
* and lifetime are not visible here -- confirm.
*/
char *zc_fru;
} zfs_case_t;
/*
* Identifiers for the persisted case state. NOTE(review): CASE_DATA and
* CASE_FRU look like the names of the per-case buffers holding the on-disk
* case structure and the FRU string respectively -- confirm against the
* serialize/unserialize code.
*/
#define CASE_DATA "data"
#define CASE_FRU "fru"
/*
* Versions of the on-disk case data. NOTE(review): the SERD version is
* presumably the one that added the SERD engine names to the on-disk
* structure -- confirm where the version is checked.
*/
#define CASE_DATA_VERSION_INITIAL 1
#define CASE_DATA_VERSION_SERD 2
typedef struct zfs_de_stats {
};
/*
* NOTE(review): presumably how long an I/O diagnosis is postponed while
* waiting for a possible 'resource.fs.zfs.removed' event. The place where
* this is assigned is not visible here -- confirm.
*/
static hrtime_t zfs_remove_timeout;
#define ZFS_MAKE_RSRC(type) \
#define ZFS_MAKE_EREPORT(type) \
/*
* Write out the persistent representation of an active case.
*/
static void
{
/*
* Always update cases to the latest version, even if they were the
* previous version when unserialized.
*/
}
/*
* Read back the persistent representation of an active case.
*/
static zfs_case_t *
{
return (NULL);
}
frulen);
}
/*
* fmd_buf_read() will have already zeroed out the remainder of the
* buffer, so we don't have to do anything special if the version
* doesn't include the SERD engine name.
*/
return (zcp);
}
/*
* Iterate over any active cases. If any cases are associated with a pool or
* vdev which is no longer present on the system, close the associated case.
*/
static void
{
int ret;
/*
* Mark any cases associated with this (pool, vdev) pair.
*/
}
}
/*
* Iterate over all children.
*/
&children) == 0) {
for (c = 0; c < children; c++)
}
&children) == 0) {
for (c = 0; c < children; c++)
}
&children) == 0) {
for (c = 0; c < children; c++)
}
}
/*ARGSUSED*/
static int
{
er_timeval_t loaded = { 0 };
int ret;
/*
* Mark any cases associated with just this pool.
*/
}
return (-1);
}
if (nelem == 2) {
}
}
}
return (0);
}
struct load_time_arg {
};
static int
{
return (0);
return (0);
return (-1);
}
}
return (0);
}
static void
{
/*
* There is no way to open a pool by GUID, or look up a vdev by GUID. No
* matter what we do, we're going to have to stomach an O(vdevs * cases)
* algorithm. In reality, both quantities are likely so small that
* neither will matter. Given that iterating over pools is more
* expensive than iterating over the in-memory case list, we opt for a
* 'present' flag in each case that starts off cleared. We then iterate
* over all pools, marking those that are still present, and removing
* those that aren't found.
*
* Note that we could also construct an FMRI and rely on
* fmd_nvl_fmri_present(), but this would end up doing the same search.
*/
/*
* Mark the cases as not present.
*/
/*
* Iterate over all pools and mark the pools and vdevs found. If this
* fails (most probably because we're out of memory), then don't close
* any of the cases, since we cannot be sure they are accurate.
*/
return;
/*
* Remove those cases which were not found.
*/
if (!zcp->zc_present)
}
}
/*
* Construct the name of a SERD engine from the pool and vdev GUIDs and the
* engine type (io or checksum).
*/
static void
const char *type)
{
}
/*
* Solve a given ZFS case. This first checks to make sure the diagnosis is
* still valid, as well as cleaning up any pending timer associated with the
* case.
*/
static void
{
int err;
/*
* Construct the detector from the case data. The detector is in the
* ZFS scheme, and is either the pool or the vdev, depending on whether
* this is a vdev or pool fault.
*/
}
/*
* We also want to make sure that the detector (pool or vdev) properly
* reflects the diagnosed state, when the fault corresponds to internal
* ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a
* device which was unavailable early in boot (because the driver/file
* wasn't available) and is now healthy will be mis-diagnosed.
*/
return;
}
/*
* If the vdev had an associated FRU, then get the FRU nvlist
* from the topo handle and use that in the suspect list. We
* explicitly lookup the FRU because the fmri reported from the
* kernel may not have up to date details about the disk itself
* (serial, part, etc).
*/
/*
* If the disk is part of the system chassis, but the
* FRU indicates a different chassis ID than our
* current system, then ignore the error. This
* indicates that the device was part of another
* cluster head, and for obvious reasons cannot be
* imported on this system.
*/
return;
}
/*
* If the device is no longer present on the system, or
* topo_fmri_fru() fails for other reasons, then fall
* back to the fmri specified in the vdev.
*/
}
}
}
if (serialize)
}
/*
* This #define and function access a private interface of the FMA
* framework. Ereports include a time-of-day upper bound.
* We want to look at that so we can compare it to when pools get
* loaded.
*/
#define FMD_EVN_TOD "__tod"
static boolean_t
{
}
/*ARGSUSED*/
static void
{
nelem == 2) {
} else {
}
}
/*
* Main fmd entry point.
*/
/*ARGSUSED*/
static void
{
/*
* We subscribe to notifications for vdev or pool removal. In these
* cases, there may be cases that no longer apply. Purge any cases
* that no longer apply.
*/
return;
}
if (isresource) {
/*
* For resources, we don't have a normal payload.
*/
&vdev_guid) != 0)
else
} else {
(void) nvlist_lookup_nvlist(nvl,
(void) nvlist_lookup_int32(nvl,
}
/*
* We also ignore all ereports generated during an import of a pool,
* since the only possible fault (.pool) would result in import failure,
* and hence no persistent fault. Some day we may want to do something
* with these ereports, so we continue generating them internally.
*/
if (pool_state == SPA_LOAD_IMPORT) {
return;
}
/*
* Device I/O errors are ignored during pool open.
*/
if (pool_state == SPA_LOAD_OPEN &&
return;
}
/*
* We ignore ereports for anything except disks and files.
*/
&type) == 0) {
return;
}
}
/*
* Determine if this ereport corresponds to an open case. Previous
* incarnations of this DE used the ENA to chain events together as
* part of the same case. The problem with this is that we rely on
* global uniqueness of cases based on (pool_guid, vdev_guid) pair when
* generating SERD engines. Instead, we have a case for each vdev or
* pool, regardless of the ENA.
*/
(void) nvlist_lookup_uint64(nvl,
if (nvlist_lookup_uint64(nvl,
vdev_guid = 0;
ena = 0;
pool_found = B_TRUE;
}
break;
}
if (pool_found) {
"ereport time %lld.%lld, pool load time = %lld.%lld\n",
}
/*
* Avoid falsely accusing a pool of being faulty. Do so by
* not replaying ereports that were generated prior to the
* current import. If the failure that generated them was
* transient because the device was actually removed but we
* didn't receive the normal asynchronous notification, we
* don't want to mark it as faulted and potentially panic. If
* there is still a problem we'd expect not to be able to
* import the pool, or that new ereports will be generated
* once the pool is used.
*/
return;
}
if (!pool_found) {
/*
* Haven't yet seen this pool, but same situation
* may apply.
*/
struct load_time_arg la;
pool_found = B_TRUE;
"ereport time %lld.%lld, "
"pool load time = %lld.%lld\n",
return;
}
}
}
fmd_case_t *cs;
zfs_case_data_t data = { 0 };
/*
* If this is one of our 'fake' resource ereports, and there is
* no case open, simply discard it.
*/
if (isresource) {
return;
}
/*
* Open a new case.
*/
/*
* Initialize the case buffer. To commonize code, we actually
* create the buffer with existing data, and then call
* zfs_case_unserialize() to instantiate the in-core structure.
*/
sizeof (zfs_case_data_t));
if (pool_found)
}
/*
* If this is an ereport for a case with an associated vdev FRU, make
* sure it is accurate and up to date.
*/
&fru) == 0) {
}
}
}
if (isresource) {
/*
* The 'resource.fs.zfs.autoreplace' event indicates
* that the pool was loaded with the 'autoreplace'
* property set. In this case, any pending device
* failures should be ignored, as the asynchronous
* autoreplace handling will take care of them.
*/
/*
* The 'resource.fs.zfs.removed' event indicates that
* device removal was detected, and the device was
* closed asynchronously. If this is the case, we
* assume that any recent I/O errors were due to the
* device removal, not any fault of the device itself.
* We reset the SERD engine, and cancel any pending
* timers.
*/
}
}
return;
}
/*
* Associate the ereport with this case.
*/
/*
* Don't do anything else if this case is already solved.
*/
return;
/*
* Determine if we should solve the case and generate a fault. We solve
* a case if:
*
* a. A pool failed to open (ereport.fs.zfs.pool)
* b. A device failed to open (ereport.fs.zfs.pool) while a pool
* was up and running.
*
* We may see a series of ereports associated with a pool open, all
* chained together by the same ENA. If the pool open succeeds, then
* we'll see no further ereports. To detect when a pool open has
* succeeded, we associate a timer with the event. When it expires, we
* close the case.
*/
/*
* Pool level fault. Before solving the case, go through and
* close any open device cases that may be pending.
*/
}
/*
* Pool level fault for reading the intent logs.
*/
/*
* Device fault.
*/
/*
* If this is a checksum or I/O error, then toss it into the
* appropriate SERD engine and check to see if it has fired.
* Ideally, we want to do something more sophisticated,
* (persistent errors for a single data block, etc). For now,
* a single SERD engine is sufficient.
*/
}
}
if (fmd_serd_record(hdl,
"fault.fs.zfs.vdev.checksum", B_FALSE);
}
FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
"fault.fs.zfs.io_failure_continue",
B_FALSE);
strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
"fault.fs.zfs.io_failure_wait", B_FALSE);
}
}
/*
* Because I/O errors may be due to device removal, we postpone
* any diagnosis until we're sure that we aren't about to
* receive a 'resource.fs.zfs.removed' event.
*/
if (checkremove) {
}
}
}
}
/*
* The timeout is fired when we diagnosed an I/O error, and it was not due to
* device removal (which would cause the timeout to be cancelled).
*/
/* ARGSUSED */
static void
{
}
static void
{
}
/*
* We use the fmd gc entry point to look for old cases that no longer apply.
* This allows us to keep our set of case data small in a long running system.
*/
static void
{
}
/*
* Table of this module's fmd entry points. The initializer is positional, so
* the order must match the member order of fmd_hdl_ops_t. No statistics
* callback is supplied (the fmdo_stats slot is NULL).
*/
static const fmd_hdl_ops_t fmd_ops = {
zfs_fm_recv, /* fmdo_recv */
zfs_fm_timeout, /* fmdo_timeout */
zfs_fm_close, /* fmdo_close */
NULL, /* fmdo_stats */
zfs_fm_gc, /* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
};
static const fmd_hdl_info_t fmd_info = {
};
void
{
fmd_case_t *cp;
return;
return;
}
return;
}
return;
}
/*
* Iterate over all active cases and unserialize the associated buffers,
* adding them to our list of open cases.
*/
/*
* Clear out any old cases that are no longer valid.
*/
}
void
{
/*
* Remove all active cases.
*/
}
}