zfs_mod.c revision b01c3b58f7eb7fb570f606f96f130fb9b2018b49
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * CDDL HEADER START
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * The contents of this file are subject to the terms of the
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Common Development and Distribution License (the "License").
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * You may not use this file except in compliance with the License.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * See the License for the specific language governing permissions
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * and limitations under the License.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * When distributing Covered Code, include this CDDL HEADER in each
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * If applicable, add the following below this CDDL HEADER, with the
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * fields enclosed by brackets "[]" replaced with your own identifying
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * information: Portions Copyright [yyyy] [name of copyright owner]
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * CDDL HEADER END
b01c3b58f7eb7fb570f606f96f130fb9b2018b49eschrock * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Use is subject to license terms.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock#pragma ident "%Z%%M% %I% %E% SMI"
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * ZFS syseventd module.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * The purpose of this module is to identify when devices are added to the
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * system, and appropriately online or replace the affected vdevs.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * When a device is added to the system:
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 1. Search for any vdevs whose devid matches that of the newly added
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 2. If no vdevs are found, then search for any vdevs whose devfs path
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * matches that of the new device.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 3. If no vdevs match by either method, then ignore the event.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 4. Attempt to online the device with a flag to indicate that it should
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * be unspared when resilvering completes. If this succeeds, then the
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * same device was inserted and we should continue normally.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 5. If the pool does not have the 'autoreplace' property set, attempt to
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * online the device again without the unspare flag, which will
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * generate a FMA fault.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * 6. If the pool has the 'autoreplace' property set, and the matching vdev
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * is a whole disk, then label the new disk and attempt a 'zpool
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * replace'.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * The module responds to EC_DEV_ADD events for both disks and lofi devices,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * indicates that a device failed to open during pool load, but the autoreplace
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * property was set. In this case, we deferred the associated FMA fault until
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * our module had a chance to process the autoreplace logic. If the device
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * could not be replaced, then the second online attempt will trigger the FMA
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * fault that we skipped earlier.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrocktypedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * The device associated with the given vdev (either by devid or physical path)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * has been added to the system. If 'isdisk' is set, then we only attempt a
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * replacement if it's a whole disk. This also implies that we should label the
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * disk first.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * First, we attempt to online the device (making sure to undo any spare
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * operation when finished). If this succeeds, then we're done. If it fails,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * but that the label was not what we expected. If the 'autoreplace' property
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * is set, then we relabel the disk (if specified), and attempt a 'zpool
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * replace'. If the online is successful, but the new state is something else
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * race, and we should avoid attempting to relabel the disk.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrockzfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_DEVID, &devid);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * We should have a way to online a device by guid. With the current
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * interface, we are forced to chop off the 's0' for whole disks.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Attempt to online the device. It would be nice to online this by
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * GUID, but the current interface only supports lookup by path.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * If the pool doesn't have the autoreplace property set, then attempt a
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * true online (without the unspare flag), which will trigger a FMA
990b4856d0eaada6f8140335733a1b1771ed2746lling if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * If this is a request to label a whole disk, then attempt to
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * write out the label. Before we can label the disk, we need
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * access to a raw node. Ideally, we'd like to walk the devinfo
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * tree and find a raw node from the corresponding parent node.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * This is overly complicated, and since we know how we labeled
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * this device in the first place, we know it's safe to switch
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * from /dev/dsk to /dev/rdsk and append the backup slice.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) strlcpy(rawpath, path + 9, sizeof (rawpath));
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Construct the root vdev to pass to zpool_vdev_attach(). While adding
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * the entire vdev structure is harmless, we construct a reduced set of
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * path/devid/wholedisk to keep it simple.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (devid && nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Utility functions to find a vdev matching given criteria.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrocktypedef struct dev_data {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock const char *dd_prop;
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrockzfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * First iterate over any children.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock for (c = 0; c < children; c++)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Normally, we want to have an exact match for the comparison
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * string. However, we allow substring matches in the following
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * <path>: This is a devpath, and the target is one
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * of its children.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * <path/> This is a devid for a whole disk, and
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * the target is one of its children.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if ((config = zpool_get_config(zhp, NULL)) != NULL) {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Given a physical device path, iterate over all (pool, vdev) pairs which
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * correspond to the given path.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrockdevpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Given a /devices path, lookup the corresponding devid for each minor node,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * and find any vdevs with matching devids. Doing this straight up would be
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * the fact that each devid ends with "/<minornode>". Once we find any valid
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * minor node, we chop off the portion after the last slash, and then search for
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * matching vdevs, which is O(vdevs in system).
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrockdevid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock size_t len = strlen(devpath) + sizeof ("/devices") +
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Try to open a known minor node.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Determine the devid as a string, with no trailing slash for the minor
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * This function is called when we receive a devfs add event. This can be
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * either a disk event or a lofi event, and the behavior is slightly different
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * depending on which it is.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * The main unit of operation is the physical device path. For disks,
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * this is the device node, as all minor nodes are affected. For lofi
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * devices, this includes the minor path. Unfortunately, this isn't
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * represented in the DEV_PHYS_PATH for various reasons.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (-1);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * If this is a lofi device, then also get the minor instance name.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Unfortunately, the current payload doesn't include an easy way to get
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * this information. So we cheat by resolving the 'dev_name' (which
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * refers to the raw device) and taking the portion between ':(*),raw'.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) strlcpy(realpath, devpath, sizeof (realpath));
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock sizeof (path))) > 0) {
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Iterate over all vdevs with a matching devid, and then those with a
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * matching /devices path. For disks, we only want to pay attention to
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * vdevs marked as whole disks. For lofi, we don't care (because we're
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * matching an exact minor name).
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock if (!devid_iter(realpath, zfs_process_add, !is_lofi))
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock (void) devpath_iter(realpath, zfs_process_add, !is_lofi);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * Called when we receive a VDEV_CHECK event, which indicates a device could not
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * be opened during initial pool open, but the autoreplace property was set on
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * the pool. In this case, we treat it as if it were an add event.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock/*ARGSUSED*/
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock const char *subclass = sysevent_get_subclass_name(ev);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * We're mainly interested in disk additions, but we also listen
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * for new lofi devices, to allow for simplified testing.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * This event signifies that a device failed to open during pool
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * load, but the 'autoreplace' property was set, so we should
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock * pretend it's just been added.
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (0);
3d7072f8bd27709dba14f6fe336f149d25d9e207eschrock return (-1);