/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains the infrastructure to migrate files and directories.
 */

#include "shadow_impl.h"

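/*
 * Upper bound on the number of FIDs we are willing to read back from a
 * single on-disk pending list when resuming a migration. Larger lists are
 * ignored and the traversal is restarted from the root instead.
 */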
static size_t shadow_fid_load_max = 10000;

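/*
 * Construct a shadow_entry_t for the given path (optionally a named entry
 * within that path) and add it to the in-core priority queue of pending
 * work. Entries that resolve to a different filesystem than the one being
 * migrated are silently skipped rather than treated as errors.
 */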
static int
shadow_add_entry(shadow_handle_t *shp, const char *path, const char *entry,
    shadow_type_t type, uint32_t depth, struct stat64 *statbuf)
{
    shadow_entry_t *sep;
    size_t len;
    struct statvfs64 vstat;

    if ((sep = shadow_zalloc(sizeof (shadow_entry_t))) == NULL)
        return (-1);

    if (entry == NULL) {
        if ((sep->se_path = shadow_strdup(path)) == NULL) {
            free(sep);
            return (-1);
        }
    } else {
        len = strlen(path) + strlen(entry) + 2;
        if ((sep->se_path = shadow_alloc(len)) == NULL) {
            free(sep);
            return (-1);
        }

        (void) snprintf(sep->se_path, len, "%s/%s", path, entry);
    }

    /*
     * If this directory is part of a different filesystem, then stop the
     * traversal rather than wasting time traversing the subdirectory. The
     * implementation of 'f_fsid' leaves something to be desired, but since
     * this is just a suggestion, it's harmless if we're wrong.
     */
    if (shp->sh_fsid != NODEV &&
        statvfs64(sep->se_path, &vstat) == 0 &&
        vstat.f_fsid != shp->sh_fsid) {
        free(sep->se_path);
        free(sep);
        return (0);
    }

    if (statbuf != NULL)
        sep->se_timestamp = statbuf->st_atim;
    sep->se_depth = depth;
    sep->se_type = type;

    (void) pthread_mutex_lock(&shp->sh_lock);
    if (shadow_pq_enqueue(&shp->sh_queue, sep) != 0) {
        (void) pthread_mutex_unlock(&shp->sh_lock);
        free(sep->se_path);
        free(sep);
        return (-1);
    }
    shadow_status_enqueue(shp, sep);
    (void) pthread_mutex_unlock(&shp->sh_lock);

    return (0);
}

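/*
 * Read one of the on-disk pending FID lists (index 0 or 1) from the
 * VFS_SHADOW_PRIVATE_PENDING directory beneath the filesystem's private
 * shadow directory. On success, returns an allocated array of FIDs and sets
 * *count. Returns NULL if the list is missing, malformed, or too large to be
 * worth processing.
 */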
static fid_t *
shadow_read_fidlist(const char *root, int idx, size_t *count)
{
    int fd;
    size_t retlen;
    struct stat64 statbuf;
    vfs_shadow_header_t header;
    fid_t *ret;
    size_t fidlen = sizeof (ret->un._fid);
    int i;
    char path[PATH_MAX];

    (void) snprintf(path, sizeof (path), "%s/%s/%s/%d", root,
        VFS_SHADOW_PRIVATE_DIR, VFS_SHADOW_PRIVATE_PENDING, idx);

    if ((fd = open(path, O_RDONLY)) < 0)
        return (NULL);

    if (fstat64(fd, &statbuf) != 0) {
        (void) close(fd);
        return (NULL);
    }

    if (statbuf.st_size < sizeof (vfs_shadow_header_t)) {
        (void) close(fd);
        return (NULL);
    }

    if (read(fd, &header, sizeof (header)) < sizeof (header)) {
        (void) close(fd);
        return (NULL);
    }

    if (header.vsh_magic != VFS_SHADOW_ATTR_LIST_MAGIC ||
        header.vsh_version != VFS_SHADOW_INTENT_VERSION) {
        (void) close(fd);
        return (NULL);
    }

    /* XXX verify endianness */

    retlen = statbuf.st_size - sizeof (header);
    if (retlen % fidlen != 0) {
        (void) close(fd);
        return (NULL);
    }

    *count = retlen / fidlen;

    /*
     * If the pending list exceeds a reasonable size, then bail out. While
     * we try to keep the FID lists short, there are times (such as when
     * there are a large number of errors) when the lists grow very large.
     * If this is the case, then it's probably not worth trying to load and
     * resume the migration from this list, and we're better off just
     * loading the root directory and starting from scratch.
     */
    if (*count > shadow_fid_load_max) {
        (void) close(fd);
        return (NULL);
    }

    if ((ret = shadow_alloc(*count * sizeof (fid_t))) == NULL) {
        (void) close(fd);
        return (NULL);
    }

    for (i = 0; i < *count; i++) {
        if (pread64(fd, ret + i, fidlen,
            sizeof (header) + fidlen * i) != fidlen) {
            free(ret);
            (void) close(fd);
            return (NULL);
        }
    }

    (void) close(fd);
    return (ret);
}

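/*
 * Hash table entry used to track which FIDs from the pending lists have
 * already been added to the work queue.
 */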
typedef struct shadow_fid_entry {
    fid_t sfe_fid;
    shadow_hash_link_t sfe_link;
} shadow_fid_entry_t;

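/*
 * Hash table callbacks for the FID table: convert an entry to its key,
 * compute a simple additive hash over the FID data, and compare two FIDs
 * for equality.
 */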
static const void *
shadow_fid_hash_convert(const void *p)
{
    const shadow_fid_entry_t *fep = p;

    return (&fep->sfe_fid);
}

static ulong_t
shadow_fid_hash_compute(const void *p)
{
    const fid_t *fidp = p;
    ulong_t hash = 0;
    int i;

    for (i = 0; i < fidp->fid_len; i++)
        hash += fidp->fid_data[i];

    return (hash);
}

static int
shadow_fid_hash_compare(const void *a, const void *b)
{
    const fid_t *fa = a;
    const fid_t *fb = b;

    if (fa->fid_len != fb->fid_len)
        return (-1);

    return (bcmp(fa->fid_data, fb->fid_data, fa->fid_len));
}

/*
 * Iterate over the given pending FID list and add an entry for each item in
 * the list.
 */
static int
shadow_load_fidlist(shadow_handle_t *shp, shadow_hash_t *seen, int idx)
{
    fid_t *fids;
    size_t i, count, depth;
    char *buf, *newbuf, *slash;
    size_t buflen, mountlen, mountptlen;
    shadow_ioc_t ioc;
    int fd;
    struct stat64 statbuf;
    shadow_type_t type;
    int ret = -1;
    shadow_fid_entry_t *fep;

    if ((fids = shadow_read_fidlist(shp->sh_mountpoint, idx,
        &count)) == NULL)
        return (0);

    if ((fd = open(shp->sh_mountpoint, O_RDONLY)) < 0) {
        free(fids);
        return (0);
    }

    if ((buf = shadow_alloc(PATH_MAX)) == NULL) {
        free(fids);
        (void) close(fd);
        return (-1);
    }

    mountptlen = strlen(shp->sh_mountpoint);

    buflen = PATH_MAX;
    bzero(&ioc, sizeof (ioc));
    ioc.si_buffer = (uint64_t)(uintptr_t)buf;
    ioc.si_length = buflen;

    mountlen = strlen(shp->sh_mountpoint);

    for (i = 0; i < count; i++) {
        ioc.si_fid = fids[i];

        if (ioc.si_fid.fid_len > MAXFIDSZ)
            continue;

        if (ioctl(fd, SHADOW_IOC_FID2PATH, &ioc) != 0) {
            if (errno == ENOMEM) {
                /*
                 * The buffer was too small. Grow it, point the
                 * ioctl argument at the new buffer, and retry
                 * this FID.
                 */
                if ((newbuf = shadow_alloc(buflen * 2)) == NULL)
                    goto error;

                free(buf);
                buf = newbuf;
                buflen *= 2;
                ioc.si_buffer = (uint64_t)(uintptr_t)buf;
                ioc.si_length = buflen;
                i--;
            }
            continue;
        }

        if (buf[0] == '\0')
            continue;

        /*
         * With two pending lists and the ability for entries to appear
         * multiple times in a pending list, we want to make sure we
         * don't add the same entry twice. For efficiency, we create a
         * hash based on FID and ignore those we've already seen.
         * Ideally, we'd like to avoid adding children if we've already
         * added a parent (which would visit the same child twice), but
         * this requires a more complicated data structure and should
         * hopefully be a rare occurrence.
         */
        if (shadow_hash_lookup(seen, &ioc.si_fid) != NULL)
            continue;

        if ((fep = shadow_alloc(sizeof (shadow_fid_entry_t))) == NULL)
            goto error;

        fep->sfe_fid = ioc.si_fid;
        shadow_hash_insert(seen, fep);

        /*
         * If this is a relative path, it is the remote path and we
         * should turn it into a guess at the absolute path.
         */
        if (buf[0] != '/') {
            if (strlen(shp->sh_mountpoint) +
                strlen(buf) + 2 > buflen) {
                if ((newbuf = shadow_alloc(buflen * 2)) == NULL)
                    goto error;

                free(buf);
                buf = newbuf;
                buflen *= 2;
                ioc.si_buffer = (uint64_t)(uintptr_t)buf;
                ioc.si_length = buflen;

                /*
                 * Forget this FID so the retry below isn't
                 * skipped by the duplicate check above.
                 */
                shadow_hash_remove(seen, fep);
                free(fep);
                i--;
                continue;
            }

            (void) memmove(buf + mountlen + 1, buf,
                strlen(buf) + 1);
            (void) memcpy(buf, shp->sh_mountpoint, mountlen);
            buf[mountlen] = '/';
        }

        if (strncmp(buf, shp->sh_mountpoint, mountlen) != 0)
            continue;

        /*
         * When we first start migration, we have the root directory
         * and its contents in the pending list. As a special case to
         * avoid looking at the entire hierarchy twice, we never add
         * the root directory to the pending list. If there is some
         * error that is keeping the root directory from being
         * migrated, we'll discover it when we process the pending
         * list.
         */
        if (buf[mountptlen] == '\0')
            continue;

        if (stat64(buf, &statbuf) != 0)
            continue;

        if (S_ISREG(statbuf.st_mode)) {
            type = SHADOW_TYPE_FILE;
        } else if (S_ISDIR(statbuf.st_mode)) {
            type = SHADOW_TYPE_DIR;
        } else {
            continue;
        }

        depth = 0;
        slash = buf + mountlen - 1;
        while ((slash = strchr(slash + 1, '/')) != NULL)
            depth++;

        if (shadow_add_entry(shp, buf, NULL, type, depth,
            &statbuf) != 0)
            goto error;
    }

    ret = 0;
error:
    free(fids);
    free(buf);
    (void) close(fd);
    return (ret);
}

/*
 * This function is responsible for adding the initial directories to the list.
 * In order to allow us to resume a previous migration, we make the assumption
 * that the filesystem is largely static, and that the remote paths are likely
 * the same as the local ones. Under this assumption, we can iterate over the
 * pending list and look up the remote path for those FIDs that are not yet
 * migrated. As an extra check, we also look at the vnode path information as
 * a second source of possible path information. If everything fails, then we
 * fall back to processing the FID list individually. While not ideal, it gets
 * the job done. This is done asynchronously from the open, when the first
 * migration is attempted. Because we don't want to block reading the FID list
 * when mounted in standby mode, we return an error if we're currently in
 * standby mode.
 */
static int
shadow_begin(shadow_handle_t *shp)
{
    FILE *mnttab = NULL;
    struct mnttab mntent, search;
    char *mntopt;
    shadow_hash_t *seen;
    shadow_fid_entry_t *fep;
    int ret;

    if ((mnttab = fopen(MNTTAB, "r")) == NULL)
        return (shadow_error(ESHADOW_NOMOUNT,
            dgettext(TEXT_DOMAIN, "failed to open /etc/mnttab")));

    bzero(&search, sizeof (search));
    search.mnt_mountp = shp->sh_mountpoint;
    if (getmntany(mnttab, &mntent, &search) != 0) {
        /* shouldn't happen */
        (void) fclose(mnttab);
        return (shadow_error(ESHADOW_NOMOUNT,
            dgettext(TEXT_DOMAIN, "no such mountpoint %s"),
            shp->sh_mountpoint));
    }

    if ((mntopt = hasmntopt(&mntent, "shadow")) != NULL &&
        strncmp(mntopt, "shadow=standby", 14) == 0) {
        (void) fclose(mnttab);
        return (shadow_error(ESHADOW_STANDBY,
            dgettext(TEXT_DOMAIN, "filesystem currently in standby")));
    }

    (void) fclose(mnttab);

    if ((seen = shadow_hash_create(offsetof(shadow_fid_entry_t, sfe_link),
        shadow_fid_hash_convert, shadow_fid_hash_compute,
        shadow_fid_hash_compare)) == NULL)
        return (-1);

    ret = 0;
    if (shadow_load_fidlist(shp, seen, 0) != 0 ||
        shadow_load_fidlist(shp, seen, 1) != 0)
        ret = -1;

    while ((fep = shadow_hash_first(seen)) != NULL) {
        shadow_hash_remove(seen, fep);
        free(fep);
    }
    shadow_hash_destroy(seen);

    if (shadow_pq_peek(&shp->sh_queue) == NULL)
        (void) shadow_add_entry(shp, shp->sh_mountpoint, NULL,
            SHADOW_TYPE_DIR, 0, NULL);

    return (ret);
}

/*
 * This function loads the pending FID list, if necessary. On success it
 * returns with sh_lock held.
 */
static int
shadow_check_begin(shadow_handle_t *shp)
{
    (void) pthread_mutex_lock(&shp->sh_lock);
    if (!shp->sh_loaded) {
        if (shp->sh_loading) {
            (void) pthread_mutex_unlock(&shp->sh_lock);
            return (shadow_error(ESHADOW_MIGRATE_BUSY,
                dgettext(TEXT_DOMAIN,
                "pending FID list is currently being loaded")));
        }

        shp->sh_loading = B_TRUE;
        (void) pthread_mutex_unlock(&shp->sh_lock);

        if (shadow_begin(shp) != 0) {
            (void) pthread_mutex_lock(&shp->sh_lock);
            shp->sh_loading = B_FALSE;
            (void) pthread_mutex_unlock(&shp->sh_lock);
            return (-1);
        }

        (void) pthread_mutex_lock(&shp->sh_lock);
    }

    shp->sh_loaded = B_TRUE;
    shp->sh_loading = B_FALSE;

    return (0);
}

/*
 * This function is called during shadow_close() and is responsible for
 * removing all items from the work queue and freeing up any errors seen.
 */
void
shadow_end(shadow_handle_t *shp)
{
    shadow_entry_t *sep;
    shadow_error_t *srp;

    while ((sep = shadow_pq_dequeue(&shp->sh_queue)) != NULL) {
        free(sep->se_path);
        free(sep);
    }

    while ((srp = shp->sh_errors) != NULL) {
        shp->sh_errors = srp->se_next;
        free(srp->se_path);
        free(srp);
    }
}

/*
 * Record an error against the given path. We first check to see if it's a
 * known error, returning if it is. Otherwise, we create an entry in the error
 * list and record the relevant information.
 */
static int
shadow_error_record(shadow_handle_t *shp, const char *path, int err)
{
    shadow_error_t *sep;

    (void) pthread_mutex_lock(&shp->sh_errlock);
    for (sep = shp->sh_errors; sep != NULL; sep = sep->se_next) {
        if (strcmp(sep->se_path, path) == 0) {
            sep->se_error = err;
            break;
        }
    }

    if (sep == NULL) {
        if ((sep = shadow_zalloc(sizeof (shadow_error_t))) == NULL) {
            (void) pthread_mutex_unlock(&shp->sh_errlock);
            return (-1);
        }

        if ((sep->se_path = shadow_strdup(path)) == NULL) {
            (void) pthread_mutex_unlock(&shp->sh_errlock);
            free(sep);
            return (-1);
        }

        sep->se_error = err;
        sep->se_next = shp->sh_errors;
        shp->sh_errors = sep;
        shp->sh_errcount++;
    }

    (void) pthread_mutex_unlock(&shp->sh_errlock);
    return (0);
}

/*
 * Called when migration fails for a file or directory. In this case, we
 * consult the kernel to get the remote path for the object. If this fails,
 * then we assume it's a local error and don't record the failure. If it
 * succeeds, it indicates there was a problem with the remote side, and we do
 * record the error.
 */
static int
shadow_error_check(shadow_handle_t *shp, const char *localpath, int err)
{
    char path[PATH_MAX];
    shadow_ioc_t ioc;
    int fd;

    if (err == 0 || err == EINTR)
        return (0);

    if ((fd = open(localpath, O_RDONLY)) < 0)
        return (0);

    bzero(&ioc, sizeof (ioc));
    ioc.si_buffer = (uint64_t)(uintptr_t)path;
    ioc.si_length = sizeof (path);

    if (ioctl(fd, SHADOW_IOC_GETPATH, &ioc) == 0 && ioc.si_processed) {
        path[PATH_MAX - 1] = '\0';
        if (shadow_error_record(shp, path, err) != 0) {
            (void) close(fd);
            return (-1);
        }
    }

    (void) close(fd);
    return (0);
}

/*
 * Internal function to calculate priority within the pending queue. This is
 * based primarily on the directory depth, as we want to proceed depth-first
 * in order to minimize the size of our pending list. We also bias towards
 * the most recently accessed entries, under the assumption that they are
 * more likely to be accessed again.
 */
uint64_t
shadow_priority(const void *data)
{
    uint64_t depth, priority;
    const shadow_entry_t *sep = data;

    /*
     * We have only 64 bits of identifiers, and a complete timestamp could
     * potentially take up this entire value. Instead, we carve 16 high
     * order bits for the depth, and then squeeze the timestamp into the
     * remaining bits. This may lose some nanosecond accuracy, but this
     * won't make a significant difference in the overall functioning of
     * the algorithm.
     */
    depth = MIN(sep->se_depth, 0xFFFF);

    priority = (uint64_t)sep->se_timestamp.tv_sec * NANOSEC +
        sep->se_timestamp.tv_nsec;
    priority = (priority >> 16) | (depth << 48);

    /*
     * At this point the highest value represents the highest priority, but
     * priority queues are based on the lowest value being the highest
     * priority. We invert the value here to achieve this.
     */
    return (~priority);
}

/*
 * The actual migration is done through the SHADOW_IOC_MIGRATE ioctl().
 * Normally, all migration errors are converted into the generic EIO error so
 * as not to confuse consumers. For data reporting purposes, however, we want
 * to get the real error.
 */
static int
shadow_migrate_fd(int fd, uint64_t *size)
{
    shadow_ioc_t ioc;

    bzero(&ioc, sizeof (ioc));

    if (ioctl(fd, SHADOW_IOC_MIGRATE, &ioc) != 0)
        return (errno);

    if (size != NULL)
        *size = ioc.si_size;

    return (ioc.si_error);
}

/*
 * Migrate a directory.
 */
static int
shadow_migrate_dir(shadow_handle_t *shp, shadow_entry_t *sep, int *errp)
{
    DIR *dirp;
    struct dirent *dp;
    struct stat64 statbuf;
    int fd;
    shadow_type_t type;
    uint64_t subdirs, size;

    if ((fd = open(sep->se_path, O_RDONLY)) < 0)
        return (0);

    if ((*errp = shadow_migrate_fd(fd, &size)) != 0) {
        (void) close(fd);
        return (0);
    }

    if ((dirp = fdopendir(fd)) == NULL) {
        (void) close(fd);
        return (0);
    }

    subdirs = 0;
    errno = 0;
    while ((dp = readdir(dirp)) != NULL) {
        if (strcmp(dp->d_name, ".") == 0 ||
            strcmp(dp->d_name, "..") == 0)
            continue;

        if (strcmp(sep->se_path, shp->sh_mountpoint) == 0) {
            /*
             * Skip the .SUNWshadow private directory.
             */
            if (strcmp(dp->d_name, VFS_SHADOW_PRIVATE_DIR) == 0)
                continue;

            /*
             * Skip .zfs if this is a ZFS filesystem and it's
             * visible.
             */
            if (shp->sh_dataset != NULL &&
                strcmp(dp->d_name, ".zfs") == 0)
                continue;
        }

        if (fstatat64(fd, dp->d_name, &statbuf,
            AT_SYMLINK_NOFOLLOW) != 0) {
            errno = 0;
            continue;
        }

        if (S_ISREG(statbuf.st_mode)) {
            type = SHADOW_TYPE_FILE;
        } else if (S_ISDIR(statbuf.st_mode)) {
            type = SHADOW_TYPE_DIR;
            subdirs++;
        } else {
            continue;
        }

        if (shadow_add_entry(shp, sep->se_path, dp->d_name,
            type, sep->se_depth + 1, &statbuf) != 0) {
            (void) closedir(dirp);
            return (-1);
        }
    }

    (void) closedir(dirp);

    if (errno == 0) {
        (void) pthread_mutex_lock(&shp->sh_lock);
        shadow_status_update(shp, sep, size, subdirs);
        (void) pthread_mutex_unlock(&shp->sh_lock);
    }

    return (0);
}

/*
 * Migrate a file.
 */
/*ARGSUSED*/
static int
shadow_migrate_file(shadow_handle_t *shp, shadow_entry_t *sep, int *errp)
{
    int fd;
    uint64_t size;

    if ((fd = open64(sep->se_path, O_RDONLY)) < 0)
        return (0);

    if ((*errp = shadow_migrate_fd(fd, &size)) != 0) {
        (void) close(fd);
        return (0);
    }

    (void) close(fd);

    (void) pthread_mutex_lock(&shp->sh_lock);
    shadow_status_update(shp, sep, size, 0);
    (void) pthread_mutex_unlock(&shp->sh_lock);

    return (0);
}

/*
 * This function processes one entry from the on-disk pending list. This
 * function can fail with ESHADOW_MIGRATE_DONE if there are no entries left to
 * process. This is called with the lock held.
 */
static int
shadow_process_pending(shadow_handle_t *shp)
{
    shadow_ioc_t ioc;
    int fd;
    char path[PATH_MAX];

    if (shp->sh_complete)
        return (shadow_set_errno(ESHADOW_MIGRATE_DONE));

    shp->sh_active++;
    (void) pthread_mutex_unlock(&shp->sh_lock);

    /*
     * This should never fail, but if it does just ignore the error and let
     * the client try again.
     */
    if ((fd = open(shp->sh_mountpoint, O_RDONLY)) < 0)
        goto error;

    bzero(&ioc, sizeof (ioc));
    ioc.si_buffer = (uint64_t)(uintptr_t)path;
    ioc.si_length = sizeof (path);
    if (ioctl(fd, SHADOW_IOC_PROCESS, &ioc) != 0) {
        (void) close(fd);
        goto error;
    }
    (void) close(fd);

    (void) pthread_mutex_lock(&shp->sh_lock);
    shp->sh_active--;
    shp->sh_onlyerrors = (boolean_t)ioc.si_onlyerrors;
    if (!ioc.si_processed) {
        shp->sh_complete = B_TRUE;
        return (shadow_set_errno(ESHADOW_MIGRATE_DONE));
    } else if (!ioc.si_error) {
        shadow_status_update(shp, NULL, ioc.si_size, 0);
        return (0);
    } else if (ioc.si_error == EINTR) {
        return (shadow_set_errno(ESHADOW_MIGRATE_INTR));
    } else {
        path[PATH_MAX - 1] = '\0';
        return (shadow_error_record(shp, path, ioc.si_error));
    }

error:
    (void) pthread_mutex_lock(&shp->sh_lock);
    shp->sh_active--;
    return (shadow_error(ESHADOW_CORRUPT,
        dgettext(TEXT_DOMAIN, "unable to process pending list")));
}

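/*
 * Argument block passed to the cleanup handler pushed by
 * shadow_migrate_one(). It tracks the handle, the entry being processed, and
 * whether the entry still needs to be returned to the work queue.
 */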
typedef struct shadow_cleanup_arg {
    shadow_handle_t *sca_hdl;
    shadow_entry_t *sca_entry;
    boolean_t sca_cleanup;
} shadow_cleanup_arg_t;

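/*
 * Cleanup handler for shadow_migrate_one(), run via pthread_cleanup_pop() or
 * on thread cancellation. If the entry has not been fully processed, it is
 * put back on the work queue; the active count is always dropped.
 */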
static void
shadow_migrate_cleanup(void *arg)
{
    shadow_cleanup_arg_t *sca = arg;
    shadow_handle_t *shp = sca->sca_hdl;
    shadow_entry_t *sep = sca->sca_entry;

    (void) pthread_mutex_lock(&shp->sh_lock);
    if (sca->sca_cleanup) {
        /*
         * If the enqueue itself fails, we'll still be safe because of
         * the on-disk pending list. This can theoretically stomp on
         * the previous error, but the only way either operation can
         * fail is with ENOMEM.
         */
        if (shadow_pq_enqueue(&shp->sh_queue, sep) == 0)
            shadow_status_enqueue(shp, sep);
    }
    shp->sh_active--;
    (void) pthread_mutex_unlock(&shp->sh_lock);
}

/*
 * Primary entry point for migrating a file or directory. The caller is
 * responsible for controlling how often this function is called and by how
 * many threads. This pulls an entry off the pending list and processes it
 * appropriately.
 *
 * This function can return ESHADOW_MIGRATE_BUSY if all possible threads are
 * busy processing data, or ESHADOW_MIGRATE_DONE if the filesystem is done
 * being migrated.
 */
int
shadow_migrate_one(shadow_handle_t *shp)
{
    shadow_entry_t *sep;
    int ret, err;
    struct timespec ts;
    shadow_cleanup_arg_t arg;

    if (shadow_check_begin(shp) != 0)
        return (-1);

    sep = shadow_pq_dequeue(&shp->sh_queue);
    if (sep == NULL) {
        if (shp->sh_active != 0) {
            (void) pthread_mutex_unlock(&shp->sh_lock);
            return (shadow_error(ESHADOW_MIGRATE_BUSY,
                dgettext(TEXT_DOMAIN,
                "all entries are actively being processed")));
        } else {
            ret = shadow_process_pending(shp);
            (void) pthread_mutex_unlock(&shp->sh_lock);
            return (ret);
        }
    }
    shp->sh_active++;
    (void) pthread_mutex_unlock(&shp->sh_lock);

    arg.sca_hdl = shp;
    arg.sca_entry = sep;
    arg.sca_cleanup = B_TRUE;
    pthread_cleanup_push(shadow_migrate_cleanup, &arg);

    /*
     * Debugging tool to allow simulation of ESHADOW_MIGRATE_BUSY. The
     * delay is specified in milliseconds.
     */
    if (shp->sh_delay != 0) {
        ts.tv_sec = shp->sh_delay / 1000;
        ts.tv_nsec = (shp->sh_delay % 1000) * 1000 * 1000;

        (void) nanosleep(&ts, NULL);
    }

    err = 0;
    switch (sep->se_type) {
    case SHADOW_TYPE_DIR:
        ret = shadow_migrate_dir(shp, sep, &err);
        break;

    case SHADOW_TYPE_FILE:
        ret = shadow_migrate_file(shp, sep, &err);
        break;

    default:
        assert(0);
    }

    (void) pthread_mutex_lock(&shp->sh_lock);

    if (err == EWOULDBLOCK) {
        /*
         * This indicates that the filesystem is mounted in standby
         * mode. If this is the case, return an error, which will
         * cause the consumer to retry at a later point (or move onto
         * other filesystems).
         */
        (void) shadow_pq_enqueue(&shp->sh_queue, sep);
        shp->sh_active--;
        (void) pthread_mutex_unlock(&shp->sh_lock);

        return (shadow_error(ESHADOW_STANDBY,
            dgettext(TEXT_DOMAIN,
            "filesystem currently mounted in standby mode")));
    }

    shadow_status_dequeue(shp, sep);
    (void) pthread_mutex_unlock(&shp->sh_lock);

    /*
     * The above functions can only fail if there is a library error (such
     * as out-of-memory conditions). In this case we should put it back in
     * our queue. If there was an I/O error or kernel level problem, we'll
     * rely on the shadow pending queue to pick up the file later as part
     * of the cleanup phase. The exception is EINTR, where we know we
     * should retry the migration.
     */
    if (ret == 0) {
        ret = shadow_error_check(shp, sep->se_path, err);

        if (err != EINTR) {
            arg.sca_cleanup = B_FALSE;
            free(sep->se_path);
            free(sep);
        }
    }

    pthread_cleanup_pop(B_TRUE);

    return (ret);
}

/*
 * Returns true if this filesystem has finished being migrated.
 */
boolean_t
shadow_migrate_done(shadow_handle_t *shp)
{
    return (shp->sh_complete);
}

/*
 * Returns true if there are only files with persistent errors left to migrate.
 * These errors may still be fixed by the user, so consumers should use this
 * information to process entries less aggressively.
 */
boolean_t
shadow_migrate_only_errors(shadow_handle_t *shp)
{
    return (shp->sh_onlyerrors);
}

/*
 * This is a debugging tool that allows applications to dump out the current
 * pending list or otherwise manipulate it. Because it's only for debugging
 * purposes, it can leave the pending list in an arbitrary invalid state if
 * something fails (e.g. a memory allocation failure).
 */
int
shadow_migrate_iter(shadow_handle_t *shp, void (*func)(const char *, void *),
    void *data)
{
    shadow_entry_t *sep;
    shadow_pq_t copy;

    if (shadow_check_begin(shp) != 0)
        return (-1);

    if (shadow_pq_init(&copy, shadow_priority) != 0) {
        (void) pthread_mutex_unlock(&shp->sh_lock);
        return (-1);
    }

    while ((sep = shadow_pq_dequeue(&shp->sh_queue)) != NULL) {
        if (shadow_pq_enqueue(&copy, sep) != 0) {
            free(sep->se_path);
            free(sep);
            goto error;
        }

        func(sep->se_path, data);
    }

error:
    while ((sep = shadow_pq_dequeue(&copy)) != NULL) {
        if (shadow_pq_enqueue(&shp->sh_queue, sep) != 0) {
            free(sep->se_path);
            free(sep);
        }
    }
    shadow_pq_fini(&copy);
    (void) pthread_mutex_unlock(&shp->sh_lock);

    return (0);
}

/*
 * Cleanup after a completed shadow migration. This is identical to
 * shadow_cancel() except that it verifies that the migration is complete.
 */
int
shadow_migrate_finalize(shadow_handle_t *shp)
{
    if (!shadow_migrate_done(shp))
        return (shadow_error(ESHADOW_MIGRATE_BUSY,
            dgettext(TEXT_DOMAIN, "migration is not complete")));

    return (shadow_cancel(shp));
}

/*
 * This is a debugging-only tool that makes it easier to simulate
 * ESHADOW_MIGRATE_BUSY by suspending shadow_migrate_one() before migrating the
 * file or directory. This should not be used by production software; if
 * throttling is needed, it should be implemented by the caller invoking
 * shadow_migrate_one() on a less frequent basis. The delay is specified in
 * milliseconds.
 */
void
shadow_migrate_delay(shadow_handle_t *shp, uint32_t delay)
{
    shp->sh_delay = delay;
}