dsl_dataset.c revision a0dc2951a04dc9cb13b8dd7ef6e4a23b48b6824f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
/*
* Sentinel owner tag. Per the ds_rwlock comment later in this file, the
* destroy thread changes a dataset's ds_owner to dsl_reaper once
* destruction is assured.
*/
static char *dsl_reaper = "the grim reaper";
/*
* Figure out how much of this delta should be propagated to the dsl_dir
* layer. If there's a refreservation, that space has already been
* partially accounted for in our ancestors.
*/
static int64_t
{
if (ds->ds_reserved == 0)
return (delta);
}
void
{
/* It could have been compressed away to nothing */
if (BP_IS_HOLE(bp))
return;
/*
* Account for the meta-objset space in its placeholder
* dsl_dir.
*/
return;
}
}
int
{
/* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
return (0);
int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
return (used);
}
int err;
} else {
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
}
}
return (used);
}
{
return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
* overestimation of the amount of space that an operation would
* consume, which is OK.
*
* There's also a small window where we could miss a pending
* snapshot, because we could set the sync task in the quiescing
* phase. So this should only be used as a guess.
*/
if (ds->ds_trysnap_txg >
}
int
{
}
/* ARGSUSED */
static void
{
}
}
static int
{
int err;
if (ds->ds_snapname[0])
return (0);
return (0);
if (err)
return (err);
return (err);
}
static int
{
int err;
else
return (err);
}
static int
{
int err;
else
return (err);
}
static int
dsl_dataset_t **dsp)
{
int err;
if (err)
return (err);
NULL);
if (err == 0) {
}
if (err) {
/*
* we don't really need to close the blist if we
* just opened it.
*/
return (err);
}
}
} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
}
if (!dsl_dataset_is_snapshot(ds)) {
/*
* In sync context, we're called with either no lock
* or with the write lock. If we're not syncing,
* we're always called with the read lock held.
*/
if (need_lock)
if (err == 0) {
}
if (need_lock)
} else {
}
if (err == 0) {
}
if (err) {
return (err);
}
} else {
ds->ds_fsid_guid =
}
}
return (ENOENT);
}
return (0);
}
static int
{
/*
* In syncing context we don't want the rwlock lock: there
* may be an existing writer waiting for sync phase to
* finish. We don't need to worry about such writers, since
* sync phase is single-threaded, so the writer can't be
* doing anything while we are active.
*/
if (dsl_pool_sync_context(dp)) {
return (0);
}
/*
* Normal users will hold the ds_rwlock as a READER until they
* are finished (i.e., call dsl_dataset_rele()). "Owners" will
* drop their READER lock after they set the ds_owner field.
*
* If the dataset is being destroyed, the destroy thread will
* obtain a WRITER lock for exclusive access after it's done its
* open-context work and then change the ds_owner to
* dsl_reaper once destruction is assured. So threads
* may block here temporarily, until the "destructability" of
* the dataset is determined.
*/
if (DSL_DATASET_IS_DESTROYED(ds)) {
return (ENOENT);
}
}
return (0);
}
int
dsl_dataset_t **dsp)
{
if (err)
return (err);
}
int
dsl_dataset_t **dsp)
{
if (err)
return (err);
return (EBUSY);
}
return (0);
}
int
{
dsl_pool_t *dp;
const char *snapname;
int err = 0;
if (err)
return (err);
if (obj)
else
if (err)
goto out;
/* we may be looking for a snapshot */
if (*snapname++ != '@') {
goto out;
}
if (err == 0)
if (ds) {
if (ds->ds_snapname[0] == 0)
sizeof (ds->ds_snapname));
}
}
out:
return (err);
}
int
{
if (err)
return (err);
!DS_MODE_IS_READONLY(flags)) {
return (EROFS);
}
return (EBUSY);
}
return (0);
}
void
{
} else {
if (ds->ds_snapname[0]) {
/*
* We use a "recursive" mutex so that we
* can call dprintf_ds() with ds_lock held.
*/
} else {
}
}
}
}
static int
{
int result;
} else {
if (ds->ds_snapname[0]) {
++result; /* adding one for the @-sign */
} else {
}
}
}
return (result);
}
void
{
}
void
{
}
}
void
{
}
else
}
{
}
return (gotit);
}
void
{
}
{
DMU_OT_NONE, 0, tx);
if (origin) {
}
}
}
return (dsobj);
}
{
return (dsobj);
}
/*
* Arguments for the snapshot-destroy walk (see the routine below that
* destroys 'snapname' in all descendants of 'fsname').
*/
struct destroyarg {
char *snapname;	/* snapshot name to destroy; presumably without the fs prefix -- confirm */
char *failed;	/* presumably receives the file system name that triggered an error -- confirm */
};
static int
{
char *cp;
int err;
*cp = '\0';
if (err == 0) {
if (ds->ds_user_ptr) {
}
err = 0;
} else {
}
return (err);
}
/*
* Destroy 'snapname' in all descendants of 'fsname'.
*/
int
{
int err;
struct destroyarg da;
if (err)
return (err);
if (err == 0)
/*
* Return the file system name that triggered the error
*/
}
}
return (err);
}
/*
* ds must be opened as OWNER. On return (whether successful or not),
* ds will be closed and caller can no longer dereference it.
*/
int
{
int err;
if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
if (ds->ds_user_ptr) {
}
goto out;
}
/*
* Check for errors and mark this ds as inconsistent, in
* case we crash while freeing the objects.
*/
if (err)
goto out;
if (err)
goto out;
/*
* remove the objects in open context, so that we won't
* have too much to do in syncing context.
*/
/*
* Ignore errors, if there is not enough disk space
* we will deal with it in dsl_dataset_destroy_sync().
*/
}
goto out;
if (err)
goto out;
if (ds->ds_user_ptr) {
/*
* We need to sync out all in-flight IO before we try
* to evict (the dataset evict func is trying to clear
* the cached entries for this dataset in the ARC).
*/
}
/*
* Blow away the dsl_dir + head dataset.
*/
if (ds->ds_user_ptr) {
}
/* if it is successful, dsl_dir_destroy_sync will close the dd */
if (err)
out:
return (err);
}
int
{
}
void *
void *p, dsl_dataset_evict_func_t func)
{
void *old;
ds->ds_user_ptr = p;
}
return (old);
}
void *
{
return (ds->ds_user_ptr);
}
blkptr_t *
{
}
void
{
/* If it's the meta-objset, set dp_meta_rootbp */
} else {
}
}
spa_t *
{
}
void
{
dsl_pool_t *dp;
return;
panic("dirtying snapshot!");
/* up the hold count until we can be written out */
}
}
/*
* The unique space in the head dataset can be calculated by subtracting
* the space used in the most recent snapshot, that is still being used
* in this file system, from the space currently in use. To figure out
* the space in the most recent snapshot still in use, we need to take
* the total space used in the snapshot and subtract out the space that
* has been freed up since the snapshot was taken.
*/
static void
{
else
mrs_used = 0;
&dluncomp));
if (!DS_UNIQUE_IS_ACCURATE(ds) &&
}
static uint64_t
{
}
/*
* NOTE(review): presumably the argument handed to the callback defined
* just below (the one whose comment notes it is never called
* concurrently) -- confirm.
*/
struct killarg {
};
static int
{
/*
* Since this callback is not called concurrently, no lock is
* needed on the accounting values.
*/
/* XXX check for EIO? */
return (0);
}
/* ARGSUSED */
static int
{
/*
* We can only roll back to emptiness if it is a ZPL objset.
*/
return (EINVAL);
/*
* This must not be a snapshot.
*/
return (EINVAL);
/*
* If we made changes this txg, traverse_dsl_dataset won't find
* them. Try again.
*/
return (EAGAIN);
return (0);
}
/* ARGSUSED */
static void
{
/*
* Before the roll back destroy the zil.
*/
/*
* We need to make sure that the objset_impl_t is reopened after
* we do the rollback, otherwise it will have the wrong
* objset_phys_t. Normally this would happen when this
* dataset-open is closed, thus causing the
* dataset to be immediately evicted. But when doing "zfs recv
* -F", we reopen the objset before that, so that there is no
* window where the dataset is closed and inconsistent.
*/
}
/* Zero out the deadlist. */
{
/* Free blkptrs that we gave birth to */
/* only deduct space beyond any refreservation */
}
/* Change our contents to that of the prev snapshot */
}
} else {
/* Zero out our contents, recreate objset */
#ifdef _KERNEL
#endif
}
}
/* ARGSUSED */
static int
{
int err;
/*
* Can't delete a head dataset if there are snapshots of it.
* (Except if the only snapshots are from the branch we cloned
* from.)
*/
return (EINVAL);
/*
* This is really a dsl_dir thing, but check it here so that
* we'll be less likely to leave this dataset inconsistent &
* nearly destroyed.
*/
if (err)
return (err);
if (count != 0)
return (EEXIST);
return (0);
}
/* ARGSUSED */
static void
{
/* Mark it as inconsistent on-disk, in case we crash */
}
/* ARGSUSED */
int
{
/* we have an owner hold, so no one else can destroy us */
/* Can't delete a branch point. */
return (EEXIST);
/*
* Can't delete a head dataset if there are snapshots of it.
* (Except if the only snapshots are from the branch we cloned
* from.)
*/
return (EINVAL);
/*
* If we made changes this txg, traverse_dsl_dataset won't find
* them. Try again.
*/
return (EAGAIN);
/* XXX we should do some i/o error checking... */
return (0);
}
/*
* NOTE(review): presumably the argument for the static callbacks defined
* just below -- confirm against their signatures.
*/
struct refsarg {
};
/* ARGSUSED */
static void
{
}
static void
{
}
void
{
int err;
int after_branch_point = FALSE;
/* signal any waiters that this dataset is going away */
/* Remove our reservation */
if (ds->ds_reserved != 0) {
}
} else {
}
if (after_branch_point &&
}
}
if (after_branch_point &&
/* This clone is toast. */
} else if (!after_branch_point) {
}
}
/*
* Transfer to our deadlist (which will become next's
* new deadlist) any entries from next's current
* deadlist which were born before prev, and free the
* other entries.
*
* XXX we're doing this long task with the config lock held
*/
if (ds_prev && !after_branch_point &&
}
} else {
/* XXX check return value? */
}
}
/* free next's deadlist */
/* set next's deadlist to our deadlist */
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
* and it. Those blocks will be born after the
* prev snap and before this snap, and will have
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
*
* XXX we're doing this long task with the
* config lock held
*/
FTAG, &ds_after_next));
itor = 0;
}
}
} else {
if (ds_prev) {
}
/*
* Reduce the amount of our unconsumed refreservation
* being charged to our parent by the amount of
* new unique data we have gained.
*/
0, 0, tx);
}
}
/*
* NB: unique_bytes might not be accurate for the head objset.
* Before SPA_VERSION 9, we didn't update its value when we
* deleted the most recent snapshot.
*/
} else {
/*
* There's no next snapshot, so this is a head dataset.
* Destroy the deadlist. Unless it's a clone, the
* deadlist should be empty. (If it's a clone, it's
* safe to ignore the deadlist contents.)
*/
/*
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
* XXX we're doing this long task with the config lock held
*/
}
/* Erase the link in the dir */
} else {
/* remove from snapshot namespace */
#ifdef ZFS_DEBUG
{
}
#endif
}
}
}
}
static int
{
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* If there's an fs-only reservation, any blocks that might become
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
return (ENOSPC);
/*
* Propagate any reserved space for this snapshot to other
* snapshot checks in this sync group.
*/
if (asize > 0)
return (0);
}
/* ARGSUSED */
int
{
int err;
/*
* We don't allow multiple snapshots of the same txg. If there
* is already one, try again.
*/
return (EAGAIN);
/*
* Check for a conflicting snapshot name.
*/
if (err == 0)
return (EEXIST);
return (err);
/*
* Check that the dataset's name is not too long. Name consists
* of the dataset's length + 1 for the @-sign + snapshot name's length
*/
return (ENAMETOOLONG);
if (err)
return (err);
return (0);
}
void
{
int err;
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
crtxg = 1;
else
} else if (next_clones_obj != 0) {
}
}
/*
* If we have a reference-reservation on this dataset, we will
* need to increase the amount of refreservation being charged
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
}
"dataset = %llu", dsobj);
}
void
{
/*
* in case we had to change ds_fsid_guid when we opened it,
* sync it out now.
*/
}
void
{
ds->ds_reserved);
/*
* This is a snapshot; override the dd's space used with
* our unique space and compression ratio.
*/
}
}
void
{
}
/* clone origin is really a dsl_dir thing... */
}
}
{
return (ds->ds_fsid_guid);
}
void
{
/*
* Adjust available bytes according to refquota
*/
else
*availbytesp = 0;
}
}
{
return (B_FALSE);
return (B_TRUE);
return (B_FALSE);
}
/* ARGSUSED */
static int
{
char *newsnapname = arg2;
int err;
if (err)
return (err);
/* new name better not be in use */
if (err == 0)
err = 0;
/* dataset name + 1 for the "@" + the new snapshot name must fit */
err = ENAMETOOLONG;
return (err);
}
static void
{
const char *newsnapname = arg2;
int err;
}
/*
* State for the snapshot-rename helpers below; apparently used on the
* recursive rename path, where a 'struct renamesnaparg *ra' is declared.
*/
struct renamesnaparg {
char failed[MAXPATHLEN];	/* presumably the name of the dataset whose rename failed -- confirm */
char *oldsnap;	/* presumably the existing snapshot name -- confirm */
char *newsnap;	/* presumably the replacement snapshot name -- confirm */
};
static int
{
char *cp;
int err;
*cp = '@';
/*
* For recursive snapshot renames the parent won't be changing
*/
return (0);
} else if (err) {
return (err);
}
#ifdef _KERNEL
/*
* For all filesystems undergoing rename, we'll need to unmount it.
*/
#endif
*cp = '\0';
return (0);
} else if (err) {
return (err);
}
return (0);
}
static int
{
int err;
struct renamesnaparg *ra;
/* truncate the snapshot name to get the fsname */
*cp = '\0';
if (err) {
return (err);
}
if (err == 0) {
}
}
}
if (err)
return (err);
}
static int
{
return (ENAMETOOLONG);
return (0);
}
int
{
const char *tail;
int err;
if (err)
return (err);
/* if we're growing, validate child name lengths */
if (delta > 0)
if (!err)
return (err);
}
if (tail[0] != '@') {
/* the name ended in a nonexistent component */
return (ENOENT);
}
/* new name must be snapshot in same filesystem */
return (EINVAL);
tail++;
return (EXDEV);
if (recursive) {
} else {
if (err)
return (err);
}
return (err);
}
/*
* dsl_dataset_promote() declares a 'struct promotenode *snap';
* presumably one node per snapshot being moved -- confirm.
*/
struct promotenode {
};
/*
* Promotion state. dsl_dataset_promote() keeps one on its stack ('pa')
* and consults pa.clone_origin; the check routine above it also tests
* pa->clone_origin through a pointer.
*/
struct promotearg {
};
/* ARGSUSED */
static int
{
char *name;
int err;
/* Check that it is a real clone */
return (EINVAL);
/* Since this is so expensive, don't do the preliminary check */
if (!dmu_tx_is_syncing(tx))
return (0);
return (EXDEV);
/* find origin's new next ds */
newnext_ds = hds;
if (newnext_ds != hds)
if (err)
return (err);
newnext_ds = prev;
}
/* compute origin's new unique space */
}
if (newnext_ds != hds)
return (err);
/*
* Walk the snapshots that we are moving
*
* Compute space to transfer. Each snapshot gave birth to:
* (my used) - (prev's used) + (deadlist's used)
* So a sequence would look like:
* uN - u(N-1) + dN + ... + u1 - u0 + d1 + u0 - 0 + d0
* Which simplifies to:
* uN + dN + ... + d1 + d0
* Note however, if we stop before we reach the ORIGIN we get:
* uN + dN + ... + dM - uM-1
*/
do {
/* Check that the snapshot name does not conflict */
if (err == 0)
break;
err = 0;
/* The very first snapshot does not have a deadlist */
break;
}
/*
* If we are a clone of a clone then we never reached ORIGIN,
* so we need to subtract out the clone origin's used space.
*/
if (pa->clone_origin) {
}
/* Check that there is enough space here */
if (err == 0) {
}
return (err);
}
static void
{
char *name;
/*
* We need to explicitly open odd, since origin_ds's dd will be
* changing.
*/
/* change origin's next snap */
/* change the origin's next clone */
oldnext_obj, tx));
}
/* change origin */
/* move snapshots to this dir */
do {
/* unregister props as dsl_dir is changing */
if (ds->ds_user_ptr) {
}
/* move snap name entry */
/* change containing dsl_dir */
/* change space accounting */
/* log history record */
}
/*
* Promote the dataset named 'name'.
*
* As the comments in the body describe, the promoted dataset inherits
* every snapshot taken before its origin, so this routine walks those
* snapshots (the snap_obj loop) and takes ownership of each. The walk
* apparently stops at the pool's origin snapshot (dp->dp_origin_snap)
* or, for a clone of a clone, at the parent's branch point.
*
* Returns the value of err; failures detected along the way either
* return it immediately or jump to 'out'.
*/
int
dsl_dataset_promote(const char *name)
{
dsl_pool_t *dp;
struct promotearg pa;
struct promotenode *snap;
int err;
if (err)
return (err);
if (err) {
return (err);
}
/*
* We are going to inherit all the snapshots taken before our
* origin (i.e., our new origin will be our parent's origin).
* Take ownership of them so that we can rename them into our
* namespace.
*/
while (snap_obj) {
/*
* NB: this would be handled by the below check for
* clone of a clone, but then we'd always own_obj() the
* $ORIGIN, thus causing unnecessary EBUSYs. We don't
* need to set pa.clone_origin because the $ORIGIN has
* no data to account for.
*/
if (dp->dp_origin_snap &&
break;
/* lost race with snapshot destroy */
continue;
} else if (err) {
goto out;
}
/*
* We could be a clone of a clone. If we reach our
* parent's branch point, we're done.
*/
if (last_snap &&
break;
}
}
if (err)
goto out;
/*
* Add in 128x the snapnames zapobj size, since we will be moving
* a bunch of snapnames to the promoted ds, and dirtying their
* bonus buffers.
*/
/* Common exit: both the error gotos above and the normal path land here. */
out:
}
if (pa.clone_origin)
/* err carries the final status back to the caller. */
return (err);
}
/*
* Arguments for the clone-swap operation. The check routine below reads
* csa->unused_refres_delta, and the swap entry point declares one on its
* stack as 'csa'.
*/
struct cloneswaparg {
};
/* ARGSUSED */
static int
{
/* they should both be heads */
return (EINVAL);
/* the branch point should be just before them */
return (EINVAL);
/* cds should be the clone */
return (EINVAL);
/* the clone should be a child of the origin */
return (EINVAL);
/* ohds shouldn't be modified unless 'force' */
return (ETXTBSY);
/* adjust amount of any unconsumed refreservation */
if (csa->unused_refres_delta > 0 &&
return (ENOSPC);
return (0);
}
/* ARGSUSED */
static void
{
int err;
}
}
/* compute unique space */
}
/* reset origin's unique bytes */
/* swap blkptrs */
{
}
/* set dd_*_bytes */
{
&cdl_comp, &cdl_uncomp));
&odl_comp, &odl_uncomp));
}
/*
* Swap two 64-bit lvalues in place.
*
* The temporary must capture (x) before it is overwritten; without it
* the macro neither compiles nor swaps. Each argument is evaluated more
* than once, so neither may have side effects. The body stays a plain
* brace block so existing "SWITCH64(a, b);" call sites keep working.
*/
#define SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}
/* swap ds_*_bytes */
/* apply any parent delta for change in unconsumed refreservation */
0, 0, tx);
/* swap deadlists */
}
/*
* Swap 'clone' with its origin head file system. Used at the end
* of "online recv" to swizzle the file system to the new version.
*/
int
{
struct cloneswaparg csa;
int error;
/* Need exclusive access for the swap */
goto retry;
}
}
return (error);
}
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
*/
int
{
dsl_pool_t *dp;
int error;
return (error);
}
return (error);
}
int
{
int error = 0;
/*
* *ref_rsrv is the portion of asize that will come from any
* unconsumed refreservation space.
*/
*ref_rsrv = 0;
/*
* Make a space adjustment for reserved bytes.
*/
*ref_rsrv =
}
return (0);
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
else
}
return (error);
}
/* ARGSUSED */
static int
{
return (ENOTSUP);
if (new_quota == 0)
return (0);
return (ENOSPC);
return (0);
}
/* ARGSUSED */
void
{
}
int
{
int err;
if (err)
return (err);
/*
* If someone removes a file, then tries to set the quota, we
* want to make sure the file freeing takes effect.
*/
}
return (err);
}
static int
{
if (new_reservation > INT64_MAX)
return (EOVERFLOW);
return (ENOTSUP);
if (dsl_dataset_is_snapshot(ds))
return (EINVAL);
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx))
return (0);
if (delta > 0 &&
return (ENOSPC);
return (ENOSPC);
return (0);
}
/* ARGSUSED */
static void
{
}
int
{
int err;
if (err)
return (err);
return (err);
}