modules/zfs_mod/zfs_mod.c

	zfs_mod.c revision b01c3b58f7eb7fb570f606f96f130fb9b2018b49
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * ZFS syseventd module.
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 *  1. Search for any vdevs whose devid matches that of the newly added
 *     device.
 *
 *  2. If no vdevs are found, then search for any vdevs whose devfs path
 *     matches that of the new device.
 *
 *  3. If no vdevs match by either method, then ignore the event.
 *
 *  4. Attempt to online the device with a flag to indicate that it should
 *     be unspared when resilvering completes.  If this succeeds, then the
 *     same device was inserted and we should continue normally.
 *
 *  5. If the pool does not have the 'autoreplace' property set, attempt to
 *     online the device again without the unspare flag, which will
 *     generate a FMA fault.
 *
 *  6. If the pool has the 'autoreplace' property set, and the matching vdev
 *     is a whole disk, then label the new disk and attempt a 'zpool
 *     replace'.
 *
 * The module responds to EC_DEV_ADD events for both disks and lofi devices,
 * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
 * indicates that a device failed to open during pool load, but the autoreplace
 * property was set.  In this case, we deferred the associated FMA fault until
 * our module had a chance to process the autoreplace logic.  If the device
 * could not be replaced, then the second online attempt will trigger the FMA
 * fault that we skipped earlier.
 */

#include <alloca.h>
#include <devid.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libsysevent.h>
#include <libzfs.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/sunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <unistd.h>

#if defined(__i386) || defined(__amd64)
#define PHYS_PATH   ":q"
#define RAW_SLICE   "p0"
#elif defined(__sparc)
#define PHYS_PATH   ":c"
#define RAW_SLICE   "s2"
#else
#error Unknown architecture
#endif

typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

libzfs_handle_t *g_zfshdl;

/*
 * The device associated with the given vdev (either by devid or physical path)
 * has been added to the system.  If 'isdisk' is set, then we only attempt a
 * replacement if it's a whole disk.  This also implies that we should label the
 * disk first.
 *
 * First, we attempt to online the device (making sure to undo any spare
 * operation when finished).  If this succeeds, then we're done.  If it fails,
 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
 * but that the label was not what we expected.  If the 'autoreplace' property
 * is not set, then we relabel the disk (if specified), and attempt a 'zpool
 * replace'.  If the online is successful, but the new state is something else
 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
 * race, and we should avoid attempting to relabel the disk.
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
{
    char *path;
    vdev_state_t newstate;
    nvlist_t *nvroot, *newvd;
    uint64_t wholedisk = 0ULL;
    char *devid = NULL;
    char rawpath[PATH_MAX], fullpath[PATH_MAX];
    size_t len;

    if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
        return;

    (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_DEVID, &devid);
    (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

    /*
     * We should have a way to online a device by guid.  With the current
     * interface, we are forced to chop off the 's0' for whole disks.
     */
    (void) strlcpy(fullpath, path, sizeof (fullpath));
    if (wholedisk)
        fullpath[strlen(fullpath) - 2] = '\0';

    /*
     * Attempt to online the device.  It would be nice to online this by
     * GUID, but the current interface only supports lookup by path.
     */
    if (zpool_vdev_online(zhp, fullpath,
        ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
        newstate != VDEV_STATE_CANT_OPEN)
        return;

    /*
     * If the pool doesn't have the autoreplace property set, then attempt a
     * true online (without the unspare flag), which will trigger a FMA
     * fault.
     */
    if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
        (isdisk && !wholedisk)) {
        (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
            &newstate);
        return;
    }

    if (isdisk) {
        /*
         * If this is a request to label a whole disk, then attempt to
         * write out the label.  Before we can label the disk, we need
         * access to a raw node.  Ideally, we'd like to walk the devinfo
         * tree and find a raw node from the corresponding parent node.
         * This is overly complicated, and since we know how we labeled
         * this device in the first place, we know it's save to switch
         * from /dev/dsk to /dev/rdsk and append the backup slice.
         */
        if (strncmp(path, "/dev/dsk/", 9) != 0)
            return;

        (void) strlcpy(rawpath, path + 9, sizeof (rawpath));
        len = strlen(rawpath);
        rawpath[len - 2] = '\0';

        if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0)
            return;
    }

    /*
     * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
     * the entire vdev structure is harmless, we construct a reduced set of
     * path/devid/wholedisk to keep it simple.
     */
    if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
        return;

    if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
        nvlist_free(nvroot);
        return;
    }

    if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
        nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
        (devid && nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID,
        devid) != 0) ||
        nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
        nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
        nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
        1) != 0) {
        nvlist_free(newvd);
        nvlist_free(nvroot);
        return;
    }

    nvlist_free(newvd);

    (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);

    nvlist_free(nvroot);

}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
    const char      *dd_compare;
    const char      *dd_prop;
    zfs_process_func_t  dd_func;
    boolean_t       dd_found;
    boolean_t       dd_isdisk;
    uint64_t        dd_pool_guid;
    uint64_t        dd_vdev_guid;
} dev_data_t;

static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
    dev_data_t *dp = data;
    char *path;
    uint_t c, children;
    nvlist_t **child;
    size_t len;
    uint64_t guid;

    /*
     * First iterate over any children.
     */
    if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
        &child, &children) == 0) {
        for (c = 0; c < children; c++)
            zfs_iter_vdev(zhp, child[c], data);
        return;
    }

    if (dp->dd_vdev_guid != 0) {
        if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
            &guid) != 0 || guid != dp->dd_vdev_guid)
            return;
    } else {
        len = strlen(dp->dd_compare);

        if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
            strncmp(dp->dd_compare, path, len) != 0)
            return;

        /*
         * Normally, we want to have an exact match for the comparison
         * string.  However, we allow substring matches in the following
         * cases:
         *
         *  <path>:     This is a devpath, and the target is one
         *          of its children.
         *
         *  <path/>     This is a devid for a whole disk, and
         *          the target is one of its children.
         */
        if (path[len] != '\0' && path[len] != ':' &&
            path[len - 1] != '/')
            return;
    }

    (dp->dd_func)(zhp, nvl, dp->dd_isdisk);
}

static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
    nvlist_t *config, *nvl;
    dev_data_t *dp = data;
    uint64_t pool_guid;

    if ((config = zpool_get_config(zhp, NULL)) != NULL) {
        if (dp->dd_pool_guid == 0 ||
            (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
            &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
            (void) nvlist_lookup_nvlist(config,
                ZPOOL_CONFIG_VDEV_TREE, &nvl);
            zfs_iter_vdev(zhp, nvl, data);
        }
    }

    zpool_close(zhp);
    return (0);
}

/*
 * Given a physical device path, iterate over all (pool, vdev) pairs which
 * correspond to the given path.
 */
static boolean_t
devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
{
    dev_data_t data = { 0 };

    data.dd_compare = devpath;
    data.dd_func = func;
    data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
    data.dd_found = B_FALSE;
    data.dd_isdisk = wholedisk;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    return (data.dd_found);
}

/*
 * Given a /devices path, lookup the corresponding devid for each minor node,
 * and find any vdevs with matching devids.  Doing this straight up would be
 * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
 * the fact that each devid ends with "/<minornode>".  Once we find any valid
 * minor node, we chop off the portion after the last slash, and then search for
 * matching vdevs, which is O(vdevs in system).
 */
static boolean_t
devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
{
    size_t len = strlen(devpath) + sizeof ("/devices") +
        sizeof (PHYS_PATH) - 1;
    char *fullpath;
    int fd;
    ddi_devid_t devid;
    char *devidstr, *fulldevid;
    dev_data_t data = { 0 };

    /*
     * Try to open a known minor node.
     */
    fullpath = alloca(len);
    (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
    if ((fd = open(fullpath, O_RDONLY)) < 0)
        return (B_FALSE);

    /*
     * Determine the devid as a string, with no trailing slash for the minor
     * node.
     */
    if (devid_get(fd, &devid) != 0) {
        (void) close(fd);
        return (B_FALSE);
    }
    (void) close(fd);

    if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
        devid_free(devid);
        return (B_FALSE);
    }

    len = strlen(devidstr) + 2;
    fulldevid = alloca(len);
    (void) snprintf(fulldevid, len, "%s/", devidstr);

    data.dd_compare = fulldevid;
    data.dd_func = func;
    data.dd_prop = ZPOOL_CONFIG_DEVID;
    data.dd_found = B_FALSE;
    data.dd_isdisk = wholedisk;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    devid_str_free(devidstr);

    return (data.dd_found);
}

/*
 * This function is called when we receive a devfs add event.  This can be
 * either a disk event or a lofi event, and the behavior is slightly different
 * depending on which it is.
 */
static int
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
{
    char *devpath, *devname;
    char path[PATH_MAX], realpath[PATH_MAX];
    char *colon, *raw;
    int ret;

    /*
     * The main unit of operation is the physical device path.  For disks,
     * this is the device node, as all minor nodes are affected.  For lofi
     * devices, this includes the minor path.  Unfortunately, this isn't
     * represented in the DEV_PHYS_PATH for various reasons.
     */
    if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
        return (-1);

    /*
     * If this is a lofi device, then also get the minor instance name.
     * Unfortunately, the current payload doesn't include an easy way to get
     * this information.  So we cheat by resolving the 'dev_name' (which
     * refers to the raw device) and taking the portion between ':(*),raw'.
     */
    (void) strlcpy(realpath, devpath, sizeof (realpath));
    if (is_lofi) {
        if (nvlist_lookup_string(nvl, DEV_NAME,
            &devname) == 0 &&
            (ret = resolvepath(devname, path,
            sizeof (path))) > 0) {
            path[ret] = '\0';
            colon = strchr(path, ':');
            if (colon != NULL)
                raw = strstr(colon + 1, ",raw");
            if (colon != NULL && raw != NULL) {
                *raw = '\0';
                (void) snprintf(realpath,
                    sizeof (realpath), "%s%s",
                    devpath, colon);
                *raw = ',';
            }
        }
    }

    /*
     * Iterate over all vdevs with a matching devid, and then those with a
     * matching /devices path.  For disks, we only want to pay attention to
     * vdevs marked as whole disks.  For lofi, we don't care (because we're
     * matching an exact minor name).
     */
    if (!devid_iter(realpath, zfs_process_add, !is_lofi))
        (void) devpath_iter(realpath, zfs_process_add, !is_lofi);

    return (0);
}

/*
 * Called when we receive a VDEV_CHECK event, which indicates a device could not
 * be opened during initial pool open, but the autoreplace property was set on
 * the pool.  In this case, we treat it as if it were an add event.
 */
static int
zfs_deliver_check(nvlist_t *nvl)
{
    dev_data_t data = { 0 };

    if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
        &data.dd_pool_guid) != 0 ||
        nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
        &data.dd_vdev_guid) != 0)
        return (0);

    data.dd_isdisk = B_TRUE;
    data.dd_func = zfs_process_add;

    (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

    return (0);
}

/*ARGSUSED*/
static int
zfs_deliver_event(sysevent_t *ev, int unused)
{
    const char *class = sysevent_get_class_name(ev);
    const char *subclass = sysevent_get_subclass_name(ev);
    nvlist_t *nvl;
    int ret;
    boolean_t is_lofi, is_check;

    if (strcmp(class, EC_DEV_ADD) == 0) {
        /*
         * We're mainly interested in disk additions, but we also listen
         * for new lofi devices, to allow for simplified testing.
         */
        if (strcmp(subclass, ESC_DISK) == 0)
            is_lofi = B_FALSE;
        else if (strcmp(subclass, ESC_LOFI) == 0)
            is_lofi = B_TRUE;
        else
            return (0);

        is_check = B_FALSE;
    } else if (strcmp(class, EC_ZFS) == 0 &&
        strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
        /*
         * This event signifies that a device failed to open during pool
         * load, but the 'autoreplace' property was set, so we should
         * pretend it's just been added.
         */
        is_check = B_TRUE;
    } else {
        return (0);
    }

    if (sysevent_get_attr_list(ev, &nvl) != 0)
        return (-1);

    if (is_check)
        ret = zfs_deliver_check(nvl);
    else
        ret = zfs_deliver_add(nvl, is_lofi);


    nvlist_free(nvl);
    return (ret);
}

static struct slm_mod_ops zfs_mod_ops = {
    SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
};

struct slm_mod_ops *
slm_init()
{
    if ((g_zfshdl = libzfs_init()) == NULL)
        return (NULL);

    return (&zfs_mod_ops);
}

void
slm_fini()
{
}