libshadowfs/common/shadow_migrate.c

	shadow_migrate.c revision 2
2N/A/*
2N/A * CDDL HEADER START
2N/A *
2N/A * The contents of this file are subject to the terms of the
2N/A * Common Development and Distribution License (the "License").
2N/A * You may not use this file except in compliance with the License.
2N/A *
2N/A * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A * or http://www.opensolaris.org/os/licensing.
2N/A * See the License for the specific language governing permissions
2N/A * and limitations under the License.
2N/A *
2N/A * When distributing Covered Code, include this CDDL HEADER in each
2N/A * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A * If applicable, add the following below this CDDL HEADER, with the
2N/A * fields enclosed by brackets "[]" replaced with your own identifying
2N/A * information: Portions Copyright [yyyy] [name of copyright owner]
2N/A *
2N/A * CDDL HEADER END
2N/A */
2N/A/*
2N/A * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
2N/A */
2N/A
2N/A/*
2N/A * This file contains the infrastructure to migrate files and directories.
2N/A */
2N/A
2N/A#include "shadow_impl.h"
2N/A
/*
 * Upper bound on the number of FIDs shadow_read_fidlist() will load from a
 * single pending list; a list with more entries than this is rejected.
 */
static size_t shadow_fid_load_max = 10000;
2N/A
2N/Astatic int
2N/Ashadow_add_entry(shadow_handle_t *shp, const char *path, const char *entry,
2N/A    shadow_type_t type, uint32_t depth, struct stat64 *statbuf)
2N/A{
2N/A    shadow_entry_t *sep;
2N/A    size_t len;
2N/A    struct statvfs64 vstat;
2N/A
2N/A    if ((sep = shadow_zalloc(sizeof (shadow_entry_t))) == NULL)
2N/A        return (-1);
2N/A
2N/A    if (entry == NULL) {
2N/A        if ((sep->se_path = shadow_strdup(path)) == NULL) {
2N/A            free(sep);
2N/A            return (-1);
2N/A        }
2N/A    } else {
2N/A        len = strlen(path) + strlen(entry) + 2;
2N/A        if ((sep->se_path = shadow_alloc(len)) == NULL) {
2N/A            free(sep);
2N/A            return (-1);
2N/A        }
2N/A
2N/A        (void) snprintf(sep->se_path, len, "%s/%s", path, entry);
2N/A    }
2N/A
2N/A    /*
2N/A     * If this directory is part of a different filesystem, then stop the
2N/A     * traversal rather than wasting time traversing the subdirectory.  The
2N/A     * implementation of 'f_fsid' leaves something to be desired, but since
2N/A     * this is just a suggestion, it's harmless if we're wrong.
2N/A     */
2N/A    if (shp->sh_fsid != NODEV &&
2N/A        statvfs64(sep->se_path, &vstat) == 0 &&
2N/A        vstat.f_fsid != shp->sh_fsid) {
2N/A        free(sep->se_path);
2N/A        free(sep);
2N/A        return (0);
2N/A    }
2N/A
2N/A    if (statbuf != NULL)
2N/A        sep->se_timestamp = statbuf->st_atim;
2N/A    sep->se_depth = depth;
2N/A    sep->se_type = type;
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    if (shadow_pq_enqueue(&shp->sh_queue, sep) != 0) {
2N/A        (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A        free(sep->se_path);
2N/A        free(sep);
2N/A        return (-1);
2N/A    }
2N/A    shadow_status_enqueue(shp, sep);
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    return (0);
2N/A}
2N/A
2N/Astatic fid_t *
2N/Ashadow_read_fidlist(const char *root, int idx, size_t *count)
2N/A{
2N/A    int fd;
2N/A    size_t retlen;
2N/A    struct stat64 statbuf;
2N/A    vfs_shadow_header_t header;
2N/A    fid_t *ret;
2N/A    size_t fidlen = sizeof (ret->un._fid);
2N/A    int i;
2N/A    char path[PATH_MAX];
2N/A
2N/A    (void) snprintf(path, sizeof (path), "%s/%s/%s/%d", root,
2N/A        VFS_SHADOW_PRIVATE_DIR, VFS_SHADOW_PRIVATE_PENDING,
2N/A        idx);
2N/A
2N/A    if ((fd = open(path, O_RDONLY)) < 0)
2N/A        return (NULL);
2N/A
2N/A    if (fstat64(fd, &statbuf) != 0) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    if (statbuf.st_size < sizeof (vfs_shadow_header_t)) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    if (read(fd, &header, sizeof (header)) < sizeof (header)) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    if (header.vsh_magic != VFS_SHADOW_ATTR_LIST_MAGIC ||
2N/A        header.vsh_version != VFS_SHADOW_INTENT_VERSION) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    /* XXX verify endianness */
2N/A
2N/A    retlen = statbuf.st_size - sizeof (header);
2N/A    if (retlen % fidlen != 0) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    *count = retlen / fidlen;
2N/A
2N/A    /*
2N/A     * If the size of the pending lists exceeds a reasonable size, then
2N/A     * bail out.  While we try to keep the FID lists short, there are times
2N/A     * (such as when there are a large number of errors) when the lists
2N/A     * grow very large.  If this is the case, then it's probably not worth
2N/A     * trying to load and resume the migration from this list, and we're
2N/A     * better off just loading the root directory and starting from
2N/A     * scratch.
2N/A     */
2N/A    if (*count > shadow_fid_load_max) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    if ((ret = shadow_alloc(*count * sizeof (fid_t))) == NULL) {
2N/A        (void) close(fd);
2N/A        return (NULL);
2N/A    }
2N/A
2N/A    for (i = 0; i < *count; i++) {
2N/A        if (pread64(fd, ret + i, fidlen,
2N/A            sizeof (header) + fidlen * i) != fidlen) {
2N/A            free(ret);
2N/A            (void) close(fd);
2N/A            return (NULL);
2N/A        }
2N/A    }
2N/A
2N/A    (void) close(fd);
2N/A    return (ret);
2N/A}
2N/A
/*
 * Element of the "seen" hash used while loading the pending FID lists.  The
 * hash is keyed on sfe_fid (see shadow_fid_hash_convert()).
 */
typedef struct shadow_fid_entry {
    fid_t           sfe_fid;	/* FID already encountered */
    shadow_hash_link_t  sfe_link;	/* hash linkage */
} shadow_fid_entry_t;
2N/A
2N/Astatic const void *
2N/Ashadow_fid_hash_convert(const void *p)
2N/A{
2N/A    const shadow_fid_entry_t *fep = p;
2N/A
2N/A    return (&fep->sfe_fid);
2N/A}
2N/A
2N/Astatic ulong_t
2N/Ashadow_fid_hash_compute(const void *p)
2N/A{
2N/A    const fid_t *fidp = p;
2N/A    ulong_t hash = 0;
2N/A    int i;
2N/A
2N/A    for (i = 0; i < fidp->fid_len; i++)
2N/A        hash += fidp->fid_data[i];
2N/A
2N/A    return (hash);
2N/A}
2N/A
2N/Astatic int
2N/Ashadow_fid_hash_compare(const void *a, const void *b)
2N/A{
2N/A    const fid_t *fa = a;
2N/A    const fid_t *fb = b;
2N/A
2N/A    if (fa->fid_len != fb->fid_len)
2N/A        return (-1);
2N/A
2N/A    return (bcmp(fa->fid_data, fb->fid_data, fa->fid_len));
2N/A}
2N/A
2N/A/*
2N/A * Iterate over the given pending FID list and add entries for each item in the
2N/A * list.
2N/A */
2N/Astatic int
2N/Ashadow_load_fidlist(shadow_handle_t *shp, shadow_hash_t *seen, int idx)
2N/A{
2N/A    fid_t *fids;
2N/A    size_t i, count, depth;
2N/A    char *buf, *newbuf, *slash;
2N/A    size_t buflen, mountlen, mountptlen;
2N/A    shadow_ioc_t ioc;
2N/A    int fd;
2N/A    struct stat64 statbuf;
2N/A    shadow_type_t type;
2N/A    int ret = -1;
2N/A    shadow_fid_entry_t *fep;
2N/A
2N/A    if ((fids = shadow_read_fidlist(shp->sh_mountpoint, idx,
2N/A        &count)) == NULL)
2N/A        return (0);
2N/A
2N/A    if ((fd = open(shp->sh_mountpoint, O_RDONLY)) < 0) {
2N/A        free(fids);
2N/A        return (0);
2N/A    }
2N/A
2N/A    if ((buf = shadow_alloc(PATH_MAX)) == NULL) {
2N/A        free(fids);
2N/A        (void) close(fd);
2N/A        return (-1);
2N/A    }
2N/A
2N/A    mountptlen = strlen(shp->sh_mountpoint);
2N/A
2N/A    buflen = PATH_MAX;
2N/A    ioc.si_buffer = (uint64_t)(uintptr_t)buf;
2N/A    ioc.si_length = buflen;
2N/A
2N/A    mountlen = strlen(shp->sh_mountpoint);
2N/A
2N/A    for (i = 0; i < count; i++) {
2N/A        ioc.si_fid = fids[i];
2N/A
2N/A        if (ioc.si_fid.fid_len > MAXFIDSZ)
2N/A            continue;
2N/A
2N/A        if (ioctl(fd, SHADOW_IOC_FID2PATH, &ioc) != 0) {
2N/A            if (errno == ENOMEM) {
2N/A                if ((newbuf = shadow_alloc(
2N/A                    buflen * 2)) == NULL) {
2N/A                    goto error;
2N/A                }
2N/A
2N/A                free(buf);
2N/A                buf = newbuf;
2N/A                buflen *= 2;
2N/A                i--;
2N/A            }
2N/A            continue;
2N/A        }
2N/A
2N/A        if (buf[0] == '\0')
2N/A            continue;
2N/A
2N/A        /*
2N/A         * With two pending lists and the abilty for entries to appear
2N/A         * multiple times in a pending list, we want to make sure we
2N/A         * don't add the same entry twice.  For efficiency, we create a
2N/A         * hash based on FID and ignore those we've already seen.
2N/A         * Ideally, we'd like to avoid adding children if we've already
2N/A         * added a parent (which would visit the same child twice), but
2N/A         * this requires a more complicated data structure and should
2N/A         * hopefully be a rare occurrence.
2N/A         */
2N/A        if (shadow_hash_lookup(seen, &ioc.si_fid) != NULL)
2N/A            continue;
2N/A
2N/A        if ((fep = shadow_alloc(sizeof (shadow_fid_entry_t))) == NULL)
2N/A            goto error;
2N/A
2N/A        fep->sfe_fid = ioc.si_fid;
2N/A        shadow_hash_insert(seen, fep);
2N/A
2N/A        /*
2N/A         * If this is a relative path, it is the remote path and we
2N/A         * should turn it into a guess at the absolute path.
2N/A         */
2N/A        if (buf[0] != '/') {
2N/A            if (strlen(shp->sh_mountpoint) +
2N/A                strlen(buf) + 2 > buflen) {
2N/A                if ((newbuf = shadow_alloc(
2N/A                    buflen * 2)) == NULL)
2N/A                    goto error;
2N/A
2N/A                free(buf);
2N/A                buf = newbuf;
2N/A                buflen *= 2;
2N/A                i--;
2N/A                continue;
2N/A            }
2N/A
2N/A            (void) memmove(buf + mountlen + 1, buf,
2N/A                strlen(buf) + 1);
2N/A            (void) memcpy(buf, shp->sh_mountpoint, mountlen);
2N/A            buf[mountlen] = '/';
2N/A        }
2N/A
2N/A        if (strncmp(buf, shp->sh_mountpoint, mountlen) != 0)
2N/A            continue;
2N/A
2N/A        /*
2N/A         * When we first start migration, we have the root directory
2N/A         * and its contents in the pending list.  As a special case to
2N/A         * avoid looking at the entire hierarchy twice, we never add
2N/A         * the root directory to the pending list.  If there is some
2N/A         * error that is keeping the root directory from being
2N/A         * migrated, we'll discover it when we process the pending
2N/A         * list.
2N/A         */
2N/A        if (buf[mountptlen] == '\0')
2N/A            continue;
2N/A
2N/A        if (stat64(buf, &statbuf) != 0)
2N/A            continue;
2N/A
2N/A        if (S_ISREG(statbuf.st_mode)) {
2N/A            type = SHADOW_TYPE_FILE;
2N/A        } else if (S_ISDIR(statbuf.st_mode)) {
2N/A            type = SHADOW_TYPE_DIR;
2N/A        } else {
2N/A            continue;
2N/A        }
2N/A
2N/A        depth = 0;
2N/A        slash = buf + mountlen - 1;
2N/A        while ((slash = strchr(slash + 1, '/')) != NULL)
2N/A            depth++;
2N/A
2N/A        if (shadow_add_entry(shp, buf, NULL, type, depth,
2N/A            &statbuf) != 0)
2N/A            goto error;
2N/A    }
2N/A
2N/A    ret = 0;
2N/Aerror:
2N/A
2N/A    free(fids);
2N/A    free(buf);
2N/A    (void) close(fd);
2N/A    return (ret);
2N/A}
2N/A
2N/A/*
2N/A * This function is responsible for adding the initial directories to the list.
2N/A * In order to allow us to resume a previous migration, we make the assumption
2N/A * that the filesystem is largely static, and the remote paths are likely the
2N/A * same as the local ones.  By this token, we can iterate over the pending list
2N/A * and lookup the remote path for those FIDs that are not yet migrated.  As an
2N/A * extra check, we also look at the vnode path information as a second source
2N/A * of possible path information.  If everything fails, then we fall back to
2N/A * processing the FID list individually.  While not ideal, it gets the job
2N/A * done.  This is done asynchronously to the open, when the first migration is
2N/A * attempted.  Because we don't want to block reading the FID list when mounted
2N/A * in standby mode, we return an error if we're currently in standby mode.
2N/A */
2N/Astatic int
2N/Ashadow_begin(shadow_handle_t *shp)
2N/A{
2N/A    FILE *mnttab = NULL;
2N/A    struct mnttab mntent, search;
2N/A    char *mntopt;
2N/A    shadow_hash_t *seen;
2N/A    shadow_fid_entry_t *fep;
2N/A    int ret;
2N/A
2N/A    if ((mnttab = fopen(MNTTAB, "r")) == NULL)
2N/A        return (shadow_error(ESHADOW_NOMOUNT,
2N/A            dgettext(TEXT_DOMAIN, "failed to open /etc/mnttab")));
2N/A
2N/A    bzero(&search, sizeof (search));
2N/A    search.mnt_mountp = shp->sh_mountpoint;
2N/A    if (getmntany(mnttab, &mntent, &search) != 0) {
2N/A        /* shouldn't happen */
2N/A        (void) fclose(mnttab);
2N/A        return (shadow_error(ESHADOW_NOMOUNT,
2N/A            dgettext(TEXT_DOMAIN, "no such mountpoint %s"),
2N/A            shp->sh_mountpoint));
2N/A    }
2N/A
2N/A    if ((mntopt = hasmntopt(&mntent, "shadow")) != NULL &&
2N/A        strncmp(mntopt, "shadow=standby", 14) == 0) {
2N/A        (void) fclose(mnttab);
2N/A        return (shadow_error(ESHADOW_STANDBY,
2N/A            dgettext(TEXT_DOMAIN, "filesystem currently in standby")));
2N/A    }
2N/A
2N/A    (void) fclose(mnttab);
2N/A
2N/A    if ((seen = shadow_hash_create(offsetof(shadow_fid_entry_t, sfe_link),
2N/A        shadow_fid_hash_convert, shadow_fid_hash_compute,
2N/A        shadow_fid_hash_compare)) == NULL)
2N/A        return (-1);
2N/A
2N/A    ret = 0;
2N/A    if (shadow_load_fidlist(shp, seen, 0) != 0 ||
2N/A        shadow_load_fidlist(shp, seen, 1) != 0)
2N/A        ret = -1;
2N/A
2N/A    while ((fep = shadow_hash_first(seen)) != NULL) {
2N/A        shadow_hash_remove(seen, fep);
2N/A        free(fep);
2N/A    }
2N/A    shadow_hash_destroy(seen);
2N/A
2N/A    if (shadow_pq_peek(&shp->sh_queue) == NULL)
2N/A        (void) shadow_add_entry(shp, shp->sh_mountpoint, NULL,
2N/A            SHADOW_TYPE_DIR, 0, NULL);
2N/A
2N/A    return (ret);
2N/A}
2N/A
2N/A/*
2N/A * This function will go and load the pending FID list, if necessary.  It
2N/A * returns with the sh_lock held on success
2N/A */
2N/Astatic int
2N/Ashadow_check_begin(shadow_handle_t *shp)
2N/A{
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    if (!shp->sh_loaded) {
2N/A        if (shp->sh_loading) {
2N/A            (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A            return (shadow_error(ESHADOW_MIGRATE_BUSY,
2N/A                dgettext(TEXT_DOMAIN,
2N/A                "pending FID list is currently being loaded")));
2N/A        }
2N/A
2N/A        shp->sh_loading = B_TRUE;
2N/A        (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A        if (shadow_begin(shp) != 0) {
2N/A            (void) pthread_mutex_lock(&shp->sh_lock);
2N/A            shp->sh_loading = B_FALSE;
2N/A            (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A            return (-1);
2N/A        }
2N/A
2N/A        (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    }
2N/A
2N/A    shp->sh_loaded = B_TRUE;
2N/A    shp->sh_loading = B_FALSE;
2N/A
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * This function is called during shadow_close() and is responsible for
2N/A * removing all items from the work queue and freeing up any errors seen.
2N/A */
2N/Avoid
2N/Ashadow_end(shadow_handle_t *shp)
2N/A{
2N/A    shadow_entry_t *sep;
2N/A    shadow_error_t *srp;
2N/A
2N/A    while ((sep = shadow_pq_dequeue(&shp->sh_queue)) != NULL) {
2N/A        free(sep->se_path);
2N/A        free(sep);
2N/A    }
2N/A
2N/A    while ((srp = shp->sh_errors) != NULL) {
2N/A        shp->sh_errors = srp->se_next;
2N/A        free(srp);
2N/A    }
2N/A}
2N/A
2N/A/*
2N/A * Record an error against the given path.  We first check to see if it's a
2N/A * known error, returning if it is.  Otherwise, we create an entry in the error
2N/A * list and record the relevant information.
2N/A */
2N/Astatic int
2N/Ashadow_error_record(shadow_handle_t *shp, const char *path, int err)
2N/A{
2N/A    shadow_error_t *sep;
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_errlock);
2N/A    for (sep = shp->sh_errors; sep != NULL; sep = sep->se_next) {
2N/A        if (strcmp(sep->se_path, path) == 0) {
2N/A            sep->se_error = err;
2N/A            break;
2N/A        }
2N/A    }
2N/A
2N/A    if (sep == NULL) {
2N/A        if ((sep = shadow_zalloc(sizeof (shadow_error_t))) == NULL) {
2N/A            (void) pthread_mutex_unlock(&shp->sh_errlock);
2N/A            return (-1);
2N/A        }
2N/A
2N/A        if ((sep->se_path = shadow_strdup(path)) == NULL) {
2N/A            (void) pthread_mutex_unlock(&shp->sh_errlock);
2N/A            free(sep);
2N/A            return (-1);
2N/A        }
2N/A
2N/A        sep->se_error = err;
2N/A        sep->se_next = shp->sh_errors;
2N/A        shp->sh_errors = sep;
2N/A        shp->sh_errcount++;
2N/A    }
2N/A
2N/A    (void) pthread_mutex_unlock(&shp->sh_errlock);
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * Called when migration fails for a file or directory.  In this case, we
2N/A * consult the kernel to get the remote path for the object.  If this fails,
2N/A * then we assume it's a local error and don't record the failure.  If it
2N/A * succeeds, it indicates there was a problem with the remote side, and we do
2N/A * record the error.
2N/A */
2N/Astatic int
2N/Ashadow_error_check(shadow_handle_t *shp, const char *localpath, int err)
2N/A{
2N/A    char path[PATH_MAX];
2N/A    shadow_ioc_t ioc;
2N/A    int fd;
2N/A
2N/A    if (err == 0 || err == EINTR)
2N/A        return (0);
2N/A
2N/A    if ((fd = open(localpath, O_RDONLY)) < 0)
2N/A        return (0);
2N/A
2N/A    bzero(&ioc, sizeof (ioc));
2N/A    ioc.si_buffer = (uint64_t)(uintptr_t)path;
2N/A    ioc.si_length = sizeof (path);
2N/A
2N/A    if (ioctl(fd, SHADOW_IOC_GETPATH, &ioc) == 0 && ioc.si_processed) {
2N/A        path[PATH_MAX - 1] = '\0';
2N/A        if (shadow_error_record(shp, path, err) != 0) {
2N/A            (void) close(fd);
2N/A            return (-1);
2N/A        }
2N/A    }
2N/A
2N/A    (void) close(fd);
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * Internal function to calculate priority within the pending queue.  This is
2N/A * based primarily on the the directory depth, as we want to proceed
2N/A * depth-first in order to minimize the size of our pending list.  We also bias
2N/A * towards the most recently accessed entries, under the assumption that they
2N/A * are more likely to be accessed again.
2N/A */
2N/Auint64_t
2N/Ashadow_priority(const void *data)
2N/A{
2N/A    uint64_t depth, priority;
2N/A    const shadow_entry_t *sep = data;
2N/A
2N/A    /*
2N/A     * We have only 64 bits of identifiers, and a complete timestamp could
2N/A     * potentially take up this entire value.  Instead, we carve 16 high
2N/A     * order bits for the depth, and then squeeze the timestamp into the
2N/A     * remaining bits.  This may lose some nanosecond accuracy, but this
2N/A     * won't make a significant difference in the overall functioning of
2N/A     * the algorithm.
2N/A     */
2N/A    depth = MIN(sep->se_depth, 0xFFFF);
2N/A
2N/A    priority = (uint64_t)sep->se_timestamp.tv_sec * NANOSEC +
2N/A        sep->se_timestamp.tv_nsec;
2N/A    priority = (priority >> 16) | (depth << 48);
2N/A
2N/A    /*
2N/A     * At this point the highest value represents the highest priority, but
2N/A     * priority queues are based on the lowest value being the highest
2N/A     * priority.  We invert the value here to achieve this.
2N/A     */
2N/A    return (~priority);
2N/A}
2N/A
2N/A/*
2N/A * The actual migration is done through the SHADOW_IOC_MIGRATE ioctl().
2N/A * Normally, all migration errors are converted into the generic EIO error so
2N/A * as not to confuse consumers.  For data reporting purposes, however, we want
2N/A * to get the real error.
2N/A */
2N/Astatic int
2N/Ashadow_migrate_fd(int fd, uint64_t *size)
2N/A{
2N/A    shadow_ioc_t ioc;
2N/A
2N/A    bzero(&ioc, sizeof (ioc));
2N/A
2N/A    if (ioctl(fd, SHADOW_IOC_MIGRATE, &ioc) != 0)
2N/A        return (errno);
2N/A
2N/A    if (size != NULL)
2N/A        *size = ioc.si_size;
2N/A
2N/A    return (ioc.si_error);
2N/A}
2N/A
2N/A/*
2N/A * Migrate a directory.
2N/A */
2N/Astatic int
2N/Ashadow_migrate_dir(shadow_handle_t *shp, shadow_entry_t *sep, int *errp)
2N/A{
2N/A    DIR *dirp;
2N/A    struct dirent *dp;
2N/A    struct stat64 statbuf;
2N/A    int fd;
2N/A    shadow_type_t type;
2N/A    uint64_t subdirs, size;
2N/A
2N/A    if ((fd = open(sep->se_path, O_RDONLY)) < 0)
2N/A        return (0);
2N/A
2N/A    if ((*errp = shadow_migrate_fd(fd, &size)) != 0) {
2N/A        (void) close(fd);
2N/A        return (0);
2N/A    }
2N/A
2N/A    if ((dirp = fdopendir(fd)) == NULL) {
2N/A        (void) close(fd);
2N/A        return (0);
2N/A    }
2N/A
2N/A    subdirs = 0;
2N/A    errno = 0;
2N/A    while ((dp = readdir(dirp)) != NULL) {
2N/A        if (strcmp(dp->d_name, ".") == 0 ||
2N/A            strcmp(dp->d_name, "..") == 0)
2N/A            continue;
2N/A
2N/A        if (strcmp(sep->se_path, shp->sh_mountpoint) == 0) {
2N/A            /*
2N/A             * Skip the .SUNWshadow private directory.
2N/A             */
2N/A            if (strcmp(dp->d_name, VFS_SHADOW_PRIVATE_DIR) == 0)
2N/A                continue;
2N/A
2N/A            /*
2N/A             * Skip .zfs if this is a ZFS filesystem and it's
2N/A             * visible.
2N/A             */
2N/A            if (shp->sh_dataset != NULL &&
2N/A                strcmp(dp->d_name, ".zfs") == 0)
2N/A                continue;
2N/A        }
2N/A
2N/A        if (fstatat64(fd, dp->d_name, &statbuf,
2N/A            AT_SYMLINK_NOFOLLOW) != 0) {
2N/A            errno = 0;
2N/A            continue;
2N/A        }
2N/A
2N/A        if (S_ISREG(statbuf.st_mode)) {
2N/A            type = SHADOW_TYPE_FILE;
2N/A        } else if (S_ISDIR(statbuf.st_mode)) {
2N/A            type = SHADOW_TYPE_DIR;
2N/A            subdirs++;
2N/A        } else {
2N/A            continue;
2N/A        }
2N/A
2N/A        if (shadow_add_entry(shp, sep->se_path, dp->d_name,
2N/A            type, sep->se_depth + 1, &statbuf) != 0) {
2N/A            (void) closedir(dirp);
2N/A            return (-1);
2N/A        }
2N/A    }
2N/A
2N/A    (void) closedir(dirp);
2N/A
2N/A    if (errno == 0) {
2N/A        (void) pthread_mutex_lock(&shp->sh_lock);
2N/A        shadow_status_update(shp, sep, size, subdirs);
2N/A        (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A    }
2N/A
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * Migrate a file.
2N/A */
2N/A/*ARGSUSED*/
2N/Astatic int
2N/Ashadow_migrate_file(shadow_handle_t *shp, shadow_entry_t *sep, int *errp)
2N/A{
2N/A    int fd;
2N/A    uint64_t size;
2N/A
2N/A    if ((fd = open64(sep->se_path, O_RDONLY)) < 0)
2N/A        return (0);
2N/A
2N/A    if ((*errp = shadow_migrate_fd(fd, &size)) != 0) {
2N/A        (void) close(fd);
2N/A        return (0);
2N/A    }
2N/A
2N/A    (void) close(fd);
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    shadow_status_update(shp, sep, size, 0);
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * This function processes one entry from the on-disk pending list.  This
2N/A * function can fail with ESHADOW_MIGRATE_DONE if there are no entries left to
2N/A * process.  This is called with the lock held.
2N/A */
2N/Astatic int
2N/Ashadow_process_pending(shadow_handle_t *shp)
2N/A{
2N/A    shadow_ioc_t ioc;
2N/A    int fd;
2N/A    char path[PATH_MAX];
2N/A
2N/A    if (shp->sh_complete)
2N/A        return (shadow_set_errno(ESHADOW_MIGRATE_DONE));
2N/A
2N/A    shp->sh_active++;
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    /*
2N/A     * This should never fail, but if it does just ignore the error and let
2N/A     * the client try again.
2N/A     */
2N/A    if ((fd = open(shp->sh_mountpoint, O_RDONLY)) < 0)
2N/A        goto error;
2N/A
2N/A    bzero(&ioc, sizeof (ioc));
2N/A    ioc.si_buffer = (uint64_t)(uintptr_t)path;
2N/A    ioc.si_length = sizeof (path);
2N/A    if (ioctl(fd, SHADOW_IOC_PROCESS, &ioc) != 0) {
2N/A        (void) close(fd);
2N/A        goto error;
2N/A    }
2N/A    (void) close(fd);
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    shp->sh_active--;
2N/A    shp->sh_onlyerrors = (boolean_t)ioc.si_onlyerrors;
2N/A    if (!ioc.si_processed) {
2N/A        shp->sh_complete = B_TRUE;
2N/A        return (shadow_set_errno(ESHADOW_MIGRATE_DONE));
2N/A    } else if (!ioc.si_error) {
2N/A        shadow_status_update(shp, NULL, ioc.si_size, 0);
2N/A        return (0);
2N/A    } else if (ioc.si_error == EINTR) {
2N/A        return (shadow_set_errno(ESHADOW_MIGRATE_INTR));
2N/A    } else {
2N/A        path[PATH_MAX - 1] = '\0';
2N/A        return (shadow_error_record(shp, path, ioc.si_error));
2N/A    }
2N/A
2N/Aerror:
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    shp->sh_active--;
2N/A    return (shadow_error(ESHADOW_CORRUPT,
2N/A        dgettext(TEXT_DOMAIN, "unable to process pending list")));
2N/A}
2N/A
/*
 * Argument handed to shadow_migrate_cleanup() through pthread_cleanup_push().
 */
typedef struct shadow_cleanup_arg {
    shadow_handle_t     *sca_hdl;	/* handle being migrated */
    shadow_entry_t      *sca_entry;	/* entry currently in flight */
    boolean_t       sca_cleanup;	/* B_TRUE: put the entry back */
} shadow_cleanup_arg_t;
2N/A
2N/Astatic void
2N/Ashadow_migrate_cleanup(void *arg)
2N/A{
2N/A    shadow_cleanup_arg_t *sca = arg;
2N/A    shadow_handle_t *shp = sca->sca_hdl;
2N/A    shadow_entry_t *sep = sca->sca_entry;
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A    if (sca->sca_cleanup) {
2N/A        /*
2N/A         * If the enqueue itself fails, we'll still be safe because of
2N/A         * the on-disk pending list.  This can theoretically stomp on
2N/A         * the previous error, but the only way either operation can
2N/A         * fail is with ENOMEM.
2N/A         */
2N/A        if (shadow_pq_enqueue(&shp->sh_queue, sep) == 0)
2N/A            shadow_status_enqueue(shp, sep);
2N/A    }
2N/A    shp->sh_active--;
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A}
2N/A
2N/A/*
2N/A * Primary entry point for migrating a file or directory.  The caller is
2N/A * responsible for controlling how often this function is called and by how
2N/A * many threads.  This pulls an entry of the pending list, and processes it
2N/A * appropriately.
2N/A *
2N/A * This function can return ESHADOW_MIGRATE_BUSY if all possible threads are
2N/A * busy processing data, or ESHADOW_MIGRATE_DONE if the filesystem is done
2N/A * being migrated.
2N/A */
2N/Aint
2N/Ashadow_migrate_one(shadow_handle_t *shp)
2N/A{
2N/A    shadow_entry_t *sep;
2N/A    int ret, err;
2N/A    struct timespec ts;
2N/A    shadow_cleanup_arg_t arg;
2N/A
2N/A    if (shadow_check_begin(shp) != 0)
2N/A        return (-1);
2N/A
2N/A    sep = shadow_pq_dequeue(&shp->sh_queue);
2N/A    if (sep == NULL) {
2N/A        if (shp->sh_active != 0) {
2N/A            (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A            return (shadow_error(ESHADOW_MIGRATE_BUSY,
2N/A                dgettext(TEXT_DOMAIN,
2N/A                "all entries are actively being processed")));
2N/A        } else {
2N/A            ret = shadow_process_pending(shp);
2N/A            (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A            return (ret);
2N/A        }
2N/A    }
2N/A    shp->sh_active++;
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    arg.sca_hdl = shp;
2N/A    arg.sca_entry = sep;
2N/A    arg.sca_cleanup = B_TRUE;
2N/A    pthread_cleanup_push(shadow_migrate_cleanup, &arg);
2N/A
2N/A    /*
2N/A     * Debugging tool to allow simulation of ESHADOW_MIGRATE_BUSY.  The
2N/A     * delay is specified in milliseconds.
2N/A     */
2N/A    if (shp->sh_delay != 0) {
2N/A        ts.tv_sec = shp->sh_delay / 1000;
2N/A        ts.tv_sec = (shp->sh_delay % 1000) * 1000 * 1000;
2N/A
2N/A        (void) nanosleep(&ts, NULL);
2N/A    }
2N/A
2N/A    err = 0;
2N/A    switch (sep->se_type) {
2N/A    case SHADOW_TYPE_DIR:
2N/A        ret = shadow_migrate_dir(shp, sep, &err);
2N/A        break;
2N/A
2N/A    case SHADOW_TYPE_FILE:
2N/A        ret = shadow_migrate_file(shp, sep, &err);
2N/A        break;
2N/A
2N/A    default:
2N/A        assert(0);
2N/A    }
2N/A
2N/A    (void) pthread_mutex_lock(&shp->sh_lock);
2N/A
2N/A    if (err == EWOULDBLOCK) {
2N/A        /*
2N/A         * This indicates that the filesystem is mounted in standby
2N/A         * mode.  If this is the case, return an error, which will
2N/A         * cause the consumer to retry at a later point (or move onto
2N/A         * other filesystems).
2N/A         */
2N/A        (void) shadow_pq_enqueue(&shp->sh_queue, sep);
2N/A        (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A        return (shadow_error(ESHADOW_STANDBY,
2N/A            dgettext(TEXT_DOMAIN,
2N/A            "filesystem currently mounted in standby mode")));
2N/A    }
2N/A
2N/A    shadow_status_dequeue(shp, sep);
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    /*
2N/A     * The above functions can only fail if there is a library error (such
2N/A     * as out-of-memory conditions).  In this case we should put it back in
2N/A     * our queue.  If there was an I/O error or kernel level problem, we'll
2N/A     * rely on the shadow pending queue to pick up the file later as part
2N/A     * of the cleanup phase.  The exception is EINTR, where we know we
2N/A     * should retry the migration.
2N/A     */
2N/A    if (ret == 0) {
2N/A        ret = shadow_error_check(shp, sep->se_path, err);
2N/A
2N/A        if (err != EINTR) {
2N/A            arg.sca_cleanup = B_FALSE;
2N/A            free(sep->se_path);
2N/A            free(sep);
2N/A        }
2N/A    }
2N/A
2N/A    pthread_cleanup_pop(B_TRUE);
2N/A
2N/A    return (ret);
2N/A}
2N/A
2N/A/*
2N/A * Returns true if this filesystem has finished being migrated.
2N/A */
2N/Aboolean_t
2N/Ashadow_migrate_done(shadow_handle_t *shp)
2N/A{
2N/A    return (shp->sh_complete);
2N/A}
2N/A
2N/A/*
2N/A * Returns true if there are only files with persistent errors left to migrate.
2N/A * These errors may still be fixed by the user, so consumers should use this
2N/A * information to process entries less aggressively.
2N/A */
2N/Aboolean_t
2N/Ashadow_migrate_only_errors(shadow_handle_t *shp)
2N/A{
2N/A    return (shp->sh_onlyerrors);
2N/A}
2N/A
2N/A/*
2N/A * This is a debugging tool that allows applications to dump out the current
2N/A * pending list or otherwise manipulate it.  Because it's only for debugging
2N/A * purposes, it can leave the pending list in an arbitrary invalid state is
2N/A * something fails (i.e. memory allocation).
2N/A */
2N/Aint
2N/Ashadow_migrate_iter(shadow_handle_t *shp, void (*func)(const char *, void *),
2N/A    void *data)
2N/A{
2N/A    shadow_entry_t *sep;
2N/A    shadow_pq_t copy;
2N/A
2N/A    if (shadow_check_begin(shp) != 0)
2N/A        return (-1);
2N/A
2N/A    if (shadow_pq_init(&copy, shadow_priority) != 0) {
2N/A        (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A        return (-1);
2N/A    }
2N/A
2N/A    while ((sep = shadow_pq_dequeue(&shp->sh_queue)) != NULL) {
2N/A        if (shadow_pq_enqueue(&copy, sep) != 0) {
2N/A            free(sep->se_path);
2N/A            free(sep);
2N/A            goto error;
2N/A        }
2N/A
2N/A        func(sep->se_path, data);
2N/A    }
2N/A
2N/Aerror:
2N/A    while ((sep = shadow_pq_dequeue(&copy)) != NULL) {
2N/A        if (shadow_pq_enqueue(&shp->sh_queue, sep) != 0) {
2N/A            free(sep->se_path);
2N/A            free(sep);
2N/A        }
2N/A    }
2N/A    shadow_pq_fini(&copy);
2N/A    (void) pthread_mutex_unlock(&shp->sh_lock);
2N/A
2N/A    return (0);
2N/A}
2N/A
2N/A/*
2N/A * Cleanup after a completed shadow migration.  This is identical to
2N/A * shadow_cancel() except that it verifies that the migration is complete.
2N/A */
2N/Aint
2N/Ashadow_migrate_finalize(shadow_handle_t *shp)
2N/A{
2N/A    if (!shadow_migrate_done(shp))
2N/A        return (shadow_error(ESHADOW_MIGRATE_BUSY,
2N/A            dgettext(TEXT_DOMAIN, "migration is not complete")));
2N/A
2N/A    return (shadow_cancel(shp));
2N/A}
2N/A
2N/A/*
2N/A * This is a debugging-only tool that makes it easier to simulate
2N/A * ESHADOW_MIGRATE_BUSY by suspending shadow_migrate_one() before migrating the
2N/A * file or directory.  This should not be used by production software - if
2N/A * there needs to be throttling done, it should be implemented by the caller
2N/A * invoking shadow_migrate_one() on a less frequent basis.  The delay is
2N/A * specified in milliseconds.
2N/A */
2N/Avoid
2N/Ashadow_migrate_delay(shadow_handle_t *shp, uint32_t delay)
2N/A{
2N/A    shp->sh_delay = delay;
2N/A}