zfs_mod.c revision 6401734d545a04c18f68b448202f9d9a77216bb9
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
* ZFS syseventd module.
*
* The purpose of this module is to identify when devices are added to the
* system, and appropriately online or replace the affected vdevs.
*
* When a device is added to the system:
*
* 1. Search for any vdevs whose devid matches that of the newly added
* device.
*
* 2. If no vdevs are found, then search for any vdevs whose devfs path
* matches that of the new device.
*
* 3. If no vdevs match by either method, then ignore the event.
*
* 4. Attempt to online the device with a flag to indicate that it should
* be unspared when resilvering completes. If this succeeds, then the
* same device was inserted and we should continue normally.
*
* 5. If the pool does not have the 'autoreplace' property set, attempt to
* online the device again without the unspare flag, which will
* generate an FMA fault.
*
* 6. If the pool has the 'autoreplace' property set, and the matching vdev
* is a whole disk, then label the new disk and attempt a 'zpool
* replace'.
*
* The module responds to EC_DEV_ADD events for both disks and lofi devices,
* with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event
* indicates that a device failed to open during pool load, but the autoreplace
* property was set. In this case, we defer the associated FMA fault until
* our module has had a chance to process the autoreplace logic. If the
* device cannot be replaced, then the second online attempt will trigger
* the FMA fault that we skipped earlier.
*/
#include <alloca.h>
#include <devid.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libsysevent.h>
#include <libzfs.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/list.h>
#include <sys/sysevent/dev.h>
#include <sys/sysevent/eventdefs.h>
#include <thread.h>
#include <thread_pool.h>
#include <unistd.h>
#include "syseventd.h"
#if defined(__i386) || defined(__amd64)
#define	PHYS_PATH	":q"
#define	RAW_SLICE	"p0"
#elif defined(__sparc)
#define	PHYS_PATH	":c"
#define	RAW_SLICE	"s2"
#else
#error Unknown architecture
#endif
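/*
 * Module-global state: the libzfs handle, the list of pools that could not
 * be opened at startup, the thread pool used to re-enable them, and the
 * bookkeeping for the asynchronous enumeration thread. The names follow
 * the references made by the functions below.
 */
libzfs_handle_t *g_zfshdl;
list_t g_pool_list;
tpool_t *g_tpool;
boolean_t g_enumeration_done;
thread_t g_zfs_tid;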
typedef struct unavailpool {
    zpool_handle_t  *uap_zhp;
    list_node_t     uap_node;
} unavailpool_t;

/*
 * Return the state of the top-level vdev for the given pool.
 */
int
zfs_toplevel_state(zpool_handle_t *zhp)
{
    nvlist_t *nvroot;
    vdev_stat_t *vs;
    unsigned int c;

    verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
        ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
    verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
        (uint64_t **)&vs, &c) == 0);
    return (vs->vs_state);
}

/*
 * Pools whose top-level vdev cannot open are queued for a later retry;
 * healthy pools are closed right away.
 */
static int
zfs_unavail_pool(zpool_handle_t *zhp, void *data)
{
    if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
        unavailpool_t *uap = malloc(sizeof (unavailpool_t));

        uap->uap_zhp = zhp;
        list_insert_tail((list_t *)data, uap);
    } else {
        zpool_close(zhp);
    }
    return (0);
}
/*
* The device associated with the given vdev (either by devid or physical path)
* has been added to the system. If 'isdisk' is set, then we only attempt a
* replacement if it's a whole disk. This also implies that we should label the
* disk first.
*
* First, we attempt to online the device (making sure to undo any spare
* operation when finished). If this succeeds, then we're done. If it fails,
* and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
* but that the label was not what we expected. If the 'autoreplace' property
* is set, then we relabel the disk (if specified), and attempt a 'zpool
* replace'. If the online is successful, but the new state is something else
* (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
* race, and we should avoid attempting to relabel the disk.
*/
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
{
    char *path;
    char *physpath = NULL;
    char fullpath[PATH_MAX], diskname[PATH_MAX];
    uint64_t wholedisk = 0ULL;
    uint64_t offline = 0ULL;
    vdev_state_t newstate;
    nvlist_t *nvroot, *newvd;

    if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
        return;
    (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
    (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
    (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);

    /*
     * We should have a way to online a device by guid. With the current
     * interface, we are forced to chop off the 's0' for whole disks.
     */
    (void) strlcpy(fullpath, path, sizeof (fullpath));
    if (wholedisk)
        fullpath[strlen(fullpath) - 2] = '\0';

    /*
     * Attempt to online the device. It would be nice to online this by
     * GUID, but the current interface only supports lookup by path.
     */
    if (offline ||
        (zpool_vdev_online(zhp, fullpath,
        ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
        (newstate == VDEV_STATE_HEALTHY ||
        newstate == VDEV_STATE_DEGRADED)))
        return;

    /*
     * If the pool doesn't have the autoreplace property set, then attempt a
     * true online (without the unspare flag), which will trigger an FMA
     * fault.
     */
    if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
        (isdisk && !wholedisk)) {
        (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
            &newstate);
        return;
    }

    if (isdisk) {
        /*
         * If this is a request to label a whole disk, then attempt to
         * write out the label. Before we can label the disk, we need
         * access to a raw node. Ideally, we'd like to walk the devinfo
         * tree and find a raw node from the corresponding parent node.
         * This is overly complicated, and since we know how we labeled
         * this device in the first place, we know it's safe to switch
         * back to the raw node by name.
         *
         * If any part of this process fails, then do a force online to
         * trigger a ZFS fault for the device (and any hot spare
         * replacement).
         */
        if (strncmp(path, ZFS_DISK_ROOTD,
            strlen(ZFS_DISK_ROOTD)) != 0) {
            (void) zpool_vdev_online(zhp, fullpath,
                ZFS_ONLINE_FORCEFAULT, &newstate);
            return;
        }

        /* strip the /dev/dsk/ prefix and the slice from the path */
        (void) strlcpy(diskname, path + strlen(ZFS_DISK_ROOTD),
            sizeof (diskname));
        diskname[strlen(diskname) - 2] = '\0';

        if (zpool_label_disk(g_zfshdl, zhp, diskname) != 0) {
            (void) zpool_vdev_online(zhp, fullpath,
                ZFS_ONLINE_FORCEFAULT, &newstate);
            return;
        }
    }

    /*
     * Construct the root vdev to pass to zpool_vdev_attach(). While adding
     * the entire vdev structure is harmless, we construct a reduced set of
     * path/physpath/wholedisk to keep it simple.
     */
    if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
        return;
    if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
        nvlist_free(nvroot);
        return;
    }
    if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
        nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
        (physpath != NULL && nvlist_add_string(newvd,
        ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
        nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
        nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
        nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
        1) != 0) {
        nvlist_free(newvd);
        nvlist_free(nvroot);
        return;
    }

    nvlist_free(newvd);
    (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
    nvlist_free(nvroot);
}
/*
* Utility functions to find a vdev matching given criteria.
*/
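/* Callback invoked for each (pool, vdev) pair that matches the search. */
typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);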
typedef struct dev_data {
    const char          *dd_compare;
    const char          *dd_prop;
    zfs_process_func_t  dd_func;
    boolean_t           dd_found;
    boolean_t           dd_isdisk;
    uint64_t            dd_pool_guid;
    uint64_t            dd_vdev_guid;
} dev_data_t;
/*
 * Walk a vdev tree recursively, invoking the dd_func callback on every
 * leaf vdev that matches the criteria in the dev_data_t.
 */
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
    dev_data_t *dp = data;
    char *path;
    uint_t c, children;
    nvlist_t **child;
    uint64_t guid;
    size_t len;

    /*
     * First iterate over any children.
     */
    if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
        &child, &children) == 0) {
        for (c = 0; c < children; c++)
            zfs_iter_vdev(zhp, child[c], data);
        return;
    }

    if (dp->dd_vdev_guid != 0) {
        if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
            &guid) != 0 || guid != dp->dd_vdev_guid)
            return;
    } else if (dp->dd_compare != NULL) {
        len = strlen(dp->dd_compare);

        if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
            strncmp(dp->dd_compare, path, len) != 0)
            return;

        /*
         * Normally, we want to have an exact match for the comparison
         * string. However, we allow substring matches in the following
         * cases:
         *
         *	<path>:		This is a devpath, and the target is one
         *			of its children.
         *
         *	<path/>		This is a devid for a whole disk, and
         *			the target is one of its children.
         */
        if (path[len] != '\0' && path[len] != ':' &&
            path[len - 1] != '/')
            return;
    }

    dp->dd_found = B_TRUE;
    (dp->dd_func)(zhp, nvl, dp->dd_isdisk);
}
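/*
 * Thread-pool callback: mount and share the datasets of a pool that has
 * transitioned back to a usable state.
 */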
void
zfs_enable_ds(void *arg)
{
    unavailpool_t *pool = (unavailpool_t *)arg;

    (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
    zpool_close(pool->uap_zhp);
    free(pool);
}
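/*
 * Callback for zpool_iter(): search the pool's vdev tree for matches and,
 * once enumeration has finished, retry pools that were unavailable when
 * the module started.
 */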
static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
    nvlist_t *config, *nvl;
    dev_data_t *dp = data;
    uint64_t pool_guid;
    unavailpool_t *pool;

    if ((config = zpool_get_config(zhp, NULL)) != NULL) {
        if (dp->dd_pool_guid == 0 ||
            (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
            &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
            (void) nvlist_lookup_nvlist(config,
                ZPOOL_CONFIG_VDEV_TREE, &nvl);
            zfs_iter_vdev(zhp, nvl, data);
        }
    }
    if (g_enumeration_done) {
        for (pool = list_head(&g_pool_list); pool != NULL;
            pool = list_next(&g_pool_list, pool)) {
            if (strcmp(zpool_get_name(zhp),
                zpool_get_name(pool->uap_zhp)) != 0)
                continue;
            if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
                list_remove(&g_pool_list, pool);
                (void) tpool_dispatch(g_tpool, zfs_enable_ds,
                    pool);
                break;
            }
        }
    }
    zpool_close(zhp);
    return (0);
}
/*
* Given a physical device path, iterate over all (pool, vdev) pairs which
* correspond to the given path.
*/
static boolean_t
devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
{
    dev_data_t data = { 0 };

    data.dd_compare = devpath;
    data.dd_func = func;
    data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
    data.dd_found = B_FALSE;
    data.dd_isdisk = wholedisk;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    return (data.dd_found);
}
/*
* Given a /devices path, lookup the corresponding devid for each minor node,
* and find any vdevs with matching devids. Doing this straight up would be
* rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
* the fact that each devid ends with "/<minornode>". Once we find any valid
* minor node, we chop off the portion after the last slash, and then search for
* matching vdevs, which is O(vdevs in system).
*/
static boolean_t
devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
{
    size_t len = strlen(devpath) + sizeof ("/devices") +
        sizeof (PHYS_PATH) - 1;
    char *fullpath;
    int fd;
    ddi_devid_t devid;
    char *minor, *devidstr, *slash;
    dev_data_t data = { 0 };

    /*
     * Try to open a known minor node.
     */
    fullpath = alloca(len);
    (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
    if ((fd = open(fullpath, O_RDONLY)) < 0)
        return (B_FALSE);

    /*
     * Determine the devid as a string, chopping off everything after the
     * last slash so that only the minor node name is dropped.
     */
    if (devid_get(fd, &devid) != 0) {
        (void) close(fd);
        return (B_FALSE);
    }
    if (devid_get_minor_name(fd, &minor) != 0) {
        devid_free(devid);
        (void) close(fd);
        return (B_FALSE);
    }
    (void) close(fd);

    devidstr = devid_str_encode(devid, minor);
    devid_free(devid);
    devid_str_free(minor);
    if (devidstr == NULL)
        return (B_FALSE);

    if ((slash = strrchr(devidstr, '/')) != NULL)
        slash[1] = '\0';

    data.dd_compare = devidstr;
    data.dd_func = func;
    data.dd_prop = ZPOOL_CONFIG_DEVID;
    data.dd_found = B_FALSE;
    data.dd_isdisk = wholedisk;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    devid_str_free(devidstr);

    return (data.dd_found);
}
/*
* This function is called when we receive a devfs add event. This can be
* either a disk event or a lofi event, and the behavior is slightly different
* depending on which it is.
*/
static int
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
{
    char *devpath, *devname;
    char path[PATH_MAX], realpath[PATH_MAX];
    char *colon, *raw;
    int ret;

    /*
     * The main unit of operation is the physical device path. For disks,
     * this is the device node, as all minor nodes are affected. For lofi
     * devices, this includes the minor path. Unfortunately, this isn't
     * represented in the DEV_PHYS_PATH for various reasons.
     */
    if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
        return (-1);

    (void) strlcpy(realpath, devpath, sizeof (realpath));

    /*
     * If this is a lofi device, then also get the minor instance name.
     * Unfortunately, the current payload doesn't include an easy way to get
     * this information. So we cheat by resolving the 'dev_name' (which
     * refers to the raw device) and taking the portion between ':(*),raw'.
     */
    if (is_lofi) {
        if (nvlist_lookup_string(nvl, DEV_NAME,
            &devname) == 0 &&
            (ret = resolvepath(devname, path,
            sizeof (path) - 1)) > 0) {
            path[ret] = '\0';
            colon = strchr(path, ':');
            raw = (colon == NULL) ? NULL :
                strstr(colon + 1, ",raw");
            if (raw != NULL) {
                *raw = '\0';
                (void) snprintf(realpath,
                    sizeof (realpath), "%s%s",
                    devpath, colon);
                *raw = ',';
            }
        }
    }

    /*
     * Iterate over all vdevs with a matching devid, and then those with a
     * matching /devices path. For disks, we only want to pay attention to
     * vdevs marked as whole disks. For lofi, we don't care (because we're
     * matching an exact minor name).
     */
    if (!devid_iter(realpath, zfs_process_add, !is_lofi) && !is_lofi)
        (void) devpath_iter(realpath, zfs_process_add, B_TRUE);

    return (0);
}
/*
* Called when we receive a VDEV_CHECK event, which indicates a device could not
* be opened during initial pool open, but the autoreplace property was set on
* the pool. In this case, we treat it as if it were an add event.
*/
static int
zfs_deliver_check(nvlist_t *nvl)
{
    dev_data_t data = { 0 };

    if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
        &data.dd_pool_guid) != 0 ||
        nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
        &data.dd_vdev_guid) != 0 ||
        data.dd_vdev_guid == 0)
        return (0);

    data.dd_isdisk = B_TRUE;
    data.dd_func = zfs_process_add;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    return (0);
}
#define DEVICE_PREFIX "/devices"
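/*
 * Called via zpool_iter() for a dynamic LUN expansion (DLE) event: locate
 * the vdev by physical path, reopen the pool so the kernel picks up the
 * new size, and online the device if the pool allows expansion.
 */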
static int
zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
{
    char *devname = data;
    boolean_t avail_spare, l2cache;
    vdev_state_t newstate;
    nvlist_t *tgt;

    syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n",
        devname, zpool_get_name(zhp));

    if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
        &avail_spare, &l2cache, NULL)) != NULL) {
        char *path, fullpath[PATH_MAX];
        uint64_t wholedisk = 0ULL;

        verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
            &path) == 0);
        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
            &wholedisk) == 0);

        (void) strlcpy(fullpath, path, sizeof (fullpath));
        if (wholedisk) {
            fullpath[strlen(fullpath) - 2] = '\0';

            /*
             * We need to reopen the pool associated with this
             * device so that the kernel can update the size
             * of the expanded device.
             */
            (void) zpool_reopen(zhp);
        }

        if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
            syseventd_print(9, "zfsdle_vdev_online: setting"
                " device %s to ONLINE state in pool %s.\n",
                fullpath, zpool_get_name(zhp));
            if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
                (void) zpool_vdev_online(zhp, fullpath, 0,
                    &newstate);
        }
        zpool_close(zhp);
        return (1);
    }
    zpool_close(zhp);
    return (0);
}
/*
* This function is called for each vdev of a pool for which any of the
* following events was received:
* - ESC_ZFS_vdev_add
* - ESC_ZFS_vdev_attach
* - ESC_ZFS_vdev_clear
* - ESC_ZFS_vdev_online
* - ESC_ZFS_pool_create
* - ESC_ZFS_pool_import
* It will update the vdev's FRU property if it is out of date.
*/
/*ARGSUSED2*/
static void
zfs_update_vdev_fru(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
{
    char *devpath, *cptr, *oldfru = NULL;
    const char *newfru;
    uint64_t vdev_guid;

    (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &vdev_guid);
    (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_FRU, &oldfru);
    if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &devpath) != 0)
        return;

    /* remove :<slice> from devpath */
    cptr = strrchr(devpath, ':');
    if (cptr != NULL)
        *cptr = '\0';

    newfru = libzfs_fru_lookup(g_zfshdl, devpath);
    if (newfru == NULL) {
        syseventd_print(9, "zfs_update_vdev_fru: no FRU for %s\n",
            devpath);
        return;
    }

    /* do nothing if the FRU hasn't changed */
    if (oldfru != NULL && libzfs_fru_compare(g_zfshdl, oldfru, newfru)) {
        syseventd_print(9, "zfs_update_vdev_fru: FRU unchanged\n");
        return;
    }

    syseventd_print(9, "zfs_update_vdev_fru: devpath = %s\n", devpath);
    syseventd_print(9, "zfs_update_vdev_fru: FRU = %s\n", newfru);

    (void) zpool_fru_set(zhp, vdev_guid, newfru);
}
/*
* This function handles the following events:
* - ESC_ZFS_vdev_add
* - ESC_ZFS_vdev_attach
* - ESC_ZFS_vdev_clear
* - ESC_ZFS_vdev_online
* - ESC_ZFS_pool_create
* - ESC_ZFS_pool_import
* It will iterate over the pool vdevs to update the FRU property.
*/
int
zfs_deliver_update(nvlist_t *nvl)
{
    dev_data_t dd = { 0 };
    char *pname;
    zpool_handle_t *zhp;
    nvlist_t *config, *vdev;

    if (nvlist_lookup_string(nvl, ZFS_EV_POOL_NAME, &pname) != 0) {
        syseventd_print(9, "zfs_deliver_update: no pool name\n");
        return (-1);
    }

    /*
     * If this event was triggered by a pool export or destroy we cannot
     * open the pool. This is not an error, just return 0 as we don't care
     * about these events.
     */
    zhp = zpool_open_canfail(g_zfshdl, pname);
    if (zhp == NULL)
        return (0);

    config = zpool_get_config(zhp, NULL);
    if (config == NULL) {
        syseventd_print(9, "zfs_deliver_update: "
            "failed to get pool config for %s\n", pname);
        zpool_close(zhp);
        return (-1);
    }

    if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
        &vdev) != 0) {
        syseventd_print(0, "zfs_deliver_update: "
            "failed to get vdev tree for %s\n", pname);
        zpool_close(zhp);
        return (-1);
    }

    dd.dd_func = zfs_update_vdev_fru;
    zfs_iter_vdev(zhp, vdev, &dd);

    zpool_close(zhp);
    return (0);
}
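/*
 * Handle a dynamic LUN expansion (DLE) event: strip the /devices prefix
 * from the supplied physical path and search every pool for the matching
 * vdev.
 */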
int
zfs_deliver_dle(nvlist_t *nvl)
{
    char *devname;

    if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
        syseventd_print(9, "zfs_deliver_dle: no physpath\n");
        return (-1);
    }
    if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) {
        syseventd_print(9, "zfs_deliver_dle: invalid "
            "device '%s'", devname);
        return (-1);
    }

    /*
     * We try to find the device using the physical
     * path that has been supplied. We need to strip off
     * the /devices prefix before starting our search.
     */
    devname += strlen(DEVICE_PREFIX);
    if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
        syseventd_print(9, "zfs_deliver_dle: device '%s' not"
            " found\n", devname);
        return (1);
    }
    return (0);
}
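/*
 * syseventd delivery entry point for this module: classify each event by
 * class and subclass, then dispatch it to one of the handlers above.
 */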
/*ARGSUSED*/
static int
zfs_deliver_event(sysevent_t *ev, int unused)
{
    const char *class = sysevent_get_class_name(ev);
    const char *subclass = sysevent_get_subclass_name(ev);
    nvlist_t *nvl;
    int ret;
    boolean_t is_lofi = B_FALSE, is_check = B_FALSE;
    boolean_t is_dle = B_FALSE, is_update = B_FALSE;

    if (strcmp(class, EC_DEV_ADD) == 0) {
        /*
         * We're mainly interested in disk additions, but we also listen
         * for new lofi devices, to allow for simplified testing.
         */
        if (strcmp(subclass, ESC_DISK) == 0)
            is_lofi = B_FALSE;
        else if (strcmp(subclass, ESC_LOFI) == 0)
            is_lofi = B_TRUE;
        else
            return (0);
    } else if (strcmp(class, EC_ZFS) == 0) {
        if (strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
            /*
             * This event signifies that a device failed to open
             * during pool load, but the 'autoreplace' property was
             * set, so we should pretend it's just been added.
             */
            is_check = B_TRUE;
        } else if (strcmp(subclass, ESC_ZFS_VDEV_ADD) == 0 ||
            strcmp(subclass, ESC_ZFS_VDEV_ATTACH) == 0 ||
            strcmp(subclass, ESC_ZFS_VDEV_CLEAR) == 0 ||
            strcmp(subclass, ESC_ZFS_VDEV_ONLINE) == 0 ||
            strcmp(subclass, ESC_ZFS_POOL_CREATE) == 0 ||
            strcmp(subclass, ESC_ZFS_POOL_IMPORT) == 0) {
            /*
             * When we receive these events we check the pool
             * configuration and update the vdev FRUs if necessary.
             */
            is_update = B_TRUE;
        } else {
            return (0);
        }
    } else if (strcmp(class, EC_DEV_STATUS) == 0 &&
        strcmp(subclass, ESC_DEV_DLE) == 0) {
        is_dle = B_TRUE;
    } else {
        return (0);
    }

    if (sysevent_get_attr_list(ev, &nvl) != 0)
        return (-1);

    if (is_dle)
        ret = zfs_deliver_dle(nvl);
    else if (is_update)
        ret = zfs_deliver_update(nvl);
    else if (is_check)
        ret = zfs_deliver_check(nvl);
    else
        ret = zfs_deliver_add(nvl, is_lofi);

    nvlist_free(nvl);
    return (ret);
}
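/*
 * Started as a separate thread from slm_init(): enumerate all pools, queue
 * the unavailable ones, and create the thread pool used to re-enable them.
 */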
/*ARGSUSED*/
void *
zfs_enum_pools(void *arg)
{
    (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
    if (!list_is_empty(&g_pool_list))
        g_tpool = tpool_create(1, sysconf(_SC_NPROCESSORS_ONLN),
            0, NULL);
    g_enumeration_done = B_TRUE;
    return (NULL);
}
static struct slm_mod_ops zfs_mod_ops = {
    SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
};
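/*
 * Module load entry point: initialize libzfs and kick off the asynchronous
 * enumeration of unavailable pools.
 */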
struct slm_mod_ops *
slm_init()
{
    if ((g_zfshdl = libzfs_init()) == NULL)
        return (NULL);

    /*
     * collect a list of unavailable pools (asynchronously,
     * since this can take a while)
     */
    list_create(&g_pool_list, sizeof (struct unavailpool),
        offsetof(struct unavailpool, uap_node));
    if (thr_create(NULL, 0, zfs_enum_pools, NULL, 0, &g_zfs_tid) != 0)
        return (NULL);

    return (&zfs_mod_ops);
}
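/*
 * Module unload entry point: wait for the enumeration thread and the
 * thread pool, then release any queued pools and the libzfs handle.
 */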
void
slm_fini()
{
    unavailpool_t *pool;

    (void) thr_join(g_zfs_tid, NULL, NULL);
    if (g_tpool != NULL) {
        tpool_wait(g_tpool);
        tpool_destroy(g_tpool);
    }
    while ((pool = list_head(&g_pool_list)) != NULL) {
        list_remove(&g_pool_list, pool);
        zpool_close(pool->uap_zhp);
        free(pool);
    }
    list_destroy(&g_pool_list);
    libzfs_fini(g_zfshdl);
}