dsl_dataset.c revision 1db4218334935d2169b128e6feb0c1ff134446fe
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
/*
* Owner tag for datasets that are being destroyed: per the locking notes
* later in this file, the destroy thread changes ds_owner to dsl_reaper
* once destruction is assured.
*/
static char *dsl_reaper = "the grim reaper";
/*
* Figure out how much of this delta should be propagated to the dsl_dir
* layer. If there's a refreservation, that space has already been
* partially accounted for in our ancestors.
*/
static int64_t
{
if (ds->ds_reserved == 0)
return (delta);
}
void
{
/* It could have been compressed away to nothing */
if (BP_IS_HOLE(bp))
return;
/*
* Account for the meta-objset space in its placeholder
* dsl_dir.
*/
return;
}
}
int
{
/* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
return (0);
int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
return (used);
}
int err;
} else {
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
}
}
}
return (used);
}
{
return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
* overestimation of the amount of space that an operation would
* consume, which is OK.
*
* There's also a small window where we could miss a pending
* snapshot, because we could set the sync task in the quiescing
* phase. So this should only be used as a guess.
*/
if (ds->ds_trysnap_txg >
}
int
{
}
/* ARGSUSED */
static void
{
}
}
static int
{
int err;
if (ds->ds_snapname[0])
return (0);
return (0);
if (err)
return (err);
return (err);
}
static int
{
int err;
else
return (err);
}
static int
{
int err;
else
return (err);
}
static int
dsl_dataset_t **dsp)
{
int err;
if (err)
return (err);
NULL);
if (err == 0) {
}
if (err) {
/*
* we don't really need to close the blist if we
* just opened it.
*/
return (err);
}
if (!dsl_dataset_is_snapshot(ds)) {
}
if (err == 0) {
ds->ds_origin_txg =
}
}
} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
}
/*
* In sync context, we're called with either no lock
* or with the write lock. If we're not syncing,
* we're always called with the read lock held.
*/
if (need_lock)
if (err == 0) {
}
if (need_lock)
} else {
}
if (err == 0) {
}
if (err) {
return (err);
}
} else {
ds->ds_fsid_guid =
}
}
return (ENOENT);
}
return (0);
}
static int
{
/*
* In syncing context we don't want the rwlock lock: there
* may be an existing writer waiting for sync phase to
* finish. We don't need to worry about such writers, since
* sync phase is single-threaded, so the writer can't be
* doing anything while we are active.
*/
if (dsl_pool_sync_context(dp)) {
return (0);
}
/*
* Normal users will hold the ds_rwlock as a READER until they
* are finished (i.e., call dsl_dataset_rele()). "Owners" will
* drop their READER lock after they set the ds_owner field.
*
* If the dataset is being destroyed, the destroy thread will
* obtain a WRITER lock for exclusive access after it's done its
* open-context work and then change the ds_owner to
* dsl_reaper once destruction is assured. So threads
* may block here temporarily, until the "destructability" of
* the dataset is determined.
*/
if (DSL_DATASET_IS_DESTROYED(ds)) {
return (ENOENT);
}
}
return (0);
}
int
dsl_dataset_t **dsp)
{
if (err)
return (err);
}
int
dsl_dataset_t **dsp)
{
if (err)
return (err);
return (EBUSY);
}
return (0);
}
int
{
dsl_pool_t *dp;
const char *snapname;
int err = 0;
if (err)
return (err);
if (obj)
else
if (err)
goto out;
/* we may be looking for a snapshot */
if (*snapname++ != '@') {
goto out;
}
if (err == 0)
if (ds) {
if (ds->ds_snapname[0] == 0)
sizeof (ds->ds_snapname));
}
}
out:
return (err);
}
int
{
if (err)
return (err);
!DS_MODE_IS_READONLY(flags)) {
return (EROFS);
}
return (EBUSY);
}
return (0);
}
void
{
} else {
if (ds->ds_snapname[0]) {
/*
* We use a "recursive" mutex so that we
* can call dprintf_ds() with ds_lock held.
*/
} else {
}
}
}
}
static int
{
int result;
} else {
if (ds->ds_snapname[0]) {
++result; /* adding one for the @-sign */
} else {
}
}
}
return (result);
}
void
{
}
void
{
}
}
void
{
}
else
}
{
}
return (gotit);
}
void
{
}
{
DMU_OT_NONE, 0, tx);
if (origin) {
}
}
}
return (dsobj);
}
{
return (dsobj);
}
/*
* Argument bundle for the recursive snapshot-destroy code in this file.
* NOTE(review): the field roles below are inferred from the field names
* and from the nearby "Destroy 'snapname' in all descendants" and
* "Return the file system name that triggered the error" comments --
* confirm against the code that fills in and reads this struct.
*/
struct destroyarg {
char *snapname;	/* presumably the snapshot name to destroy in each descendant */
char *failed;	/* presumably receives the name of the file system that failed */
};
static int
{
char *cp;
int err;
*cp = '\0';
if (err == 0) {
if (ds->ds_user_ptr) {
}
err = 0;
} else {
}
return (err);
}
/*
* Destroy 'snapname' in all descendants of 'fsname'.
*/
int
{
int err;
struct destroyarg da;
if (err)
return (err);
if (err == 0)
/*
* Return the file system name that triggered the error
*/
}
}
return (err);
}
/*
* ds must be opened as OWNER. On return (whether successful or not),
* ds will be closed and caller can no longer dereference it.
*/
int
{
int err;
if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
if (ds->ds_user_ptr) {
}
goto out;
}
/*
* Check for errors and mark this ds as inconsistent, in
* case we crash while freeing the objects.
*/
if (err)
goto out;
if (err)
goto out;
/*
* remove the objects in open context, so that we won't
* have too much to do in syncing context.
*/
/*
* Ignore errors, if there is not enough disk space
* we will deal with it in dsl_dataset_destroy_sync().
*/
}
goto out;
if (err)
goto out;
if (ds->ds_user_ptr) {
/*
* We need to sync out all in-flight IO before we try
* to evict (the dataset evict func is trying to clear
* the cached entries for this dataset in the ARC).
*/
}
/*
* Blow away the dsl_dir + head dataset.
*/
if (ds->ds_user_ptr) {
}
/* if it is successful, dsl_dir_destroy_sync will close the dd */
if (err)
out:
return (err);
}
int
{
int err;
/* drop exclusive access */
return (err);
}
void *
void *p, dsl_dataset_evict_func_t func)
{
void *old;
ds->ds_user_ptr = p;
}
return (old);
}
void *
{
return (ds->ds_user_ptr);
}
blkptr_t *
{
}
void
{
/* If it's the meta-objset, set dp_meta_rootbp */
} else {
}
}
spa_t *
{
}
void
{
dsl_pool_t *dp;
return;
panic("dirtying snapshot!");
/* up the hold count until we can be written out */
}
}
/*
* The unique space in the head dataset can be calculated by subtracting
* the space used in the most recent snapshot, that is still being used
* in this file system, from the space currently in use. To figure out
* the space in the most recent snapshot still in use, we need to take
* the total space used in the snapshot and subtract out the space that
* has been freed up since the snapshot was taken.
*/
static void
{
else
mrs_used = 0;
&dluncomp));
if (!DS_UNIQUE_IS_ACCURATE(ds) &&
}
static uint64_t
{
}
struct killarg {
};
/* ARGSUSED */
static int
{
return (0);
return (0);
}
/* ARGSUSED */
static int
{
/*
* We can only roll back to emptiness if it is a ZPL objset.
*/
return (EINVAL);
/*
* This must not be a snapshot.
*/
return (EINVAL);
/*
* If we made changes this txg, traverse_dataset won't find
* them. Try again.
*/
return (EAGAIN);
return (0);
}
/* ARGSUSED */
static void
{
/*
* Before the roll back destroy the zil.
*/
/*
* We need to make sure that the objset_impl_t is reopened after
* we do the rollback, otherwise it will have the wrong
* objset_phys_t. Normally this would happen when this
* dataset-open is closed, thus causing the
* dataset to be immediately evicted. But when doing "zfs recv
* -F", we reopen the objset before that, so that there is no
* window where the dataset is closed and inconsistent.
*/
}
/* Transfer space that was freed since last snap back to the head. */
{
}
/* Zero out the deadlist. */
{
/* Free blkptrs that we gave birth to */
}
/* Change our contents to that of the prev snapshot */
}
} else {
#ifdef _KERNEL
#endif
}
}
/* ARGSUSED */
static int
{
int err;
/*
* Can't delete a head dataset if there are snapshots of it.
* (Except if the only snapshots are from the branch we cloned
* from.)
*/
return (EINVAL);
/*
* This is really a dsl_dir thing, but check it here so that
* we'll be less likely to leave this dataset inconsistent &
* nearly destroyed.
*/
if (err)
return (err);
if (count != 0)
return (EEXIST);
return (0);
}
/* ARGSUSED */
static void
{
/* Mark it as inconsistent on-disk, in case we crash */
}
/* ARGSUSED */
int
{
/* we have an owner hold, so no one else can destroy us */
/* Can't delete a branch point. */
return (EEXIST);
/*
* Can't delete a head dataset if there are snapshots of it.
* (Except if the only snapshots are from the branch we cloned
* from.)
*/
return (EINVAL);
/*
* If we made changes this txg, traverse_dsl_dataset won't find
* them. Try again.
*/
return (EAGAIN);
/* XXX we should do some i/o error checking... */
return (0);
}
struct refsarg {
};
/* ARGSUSED */
static void
{
}
static void
{
}
void
{
int err;
int after_branch_point = FALSE;
/* signal any waiters that this dataset is going away */
/* Remove our reservation */
if (ds->ds_reserved != 0) {
}
} else {
}
if (after_branch_point &&
}
}
if (after_branch_point &&
/* This clone is toast. */
} else if (!after_branch_point) {
}
}
/*
* Transfer to our deadlist (which will become next's
* new deadlist) any entries from next's current
* deadlist which were born before prev, and free the
* other entries.
*
* XXX we're doing this long task with the config lock held
*/
if (ds_prev && !after_branch_point &&
}
} else {
/* XXX check return value? */
}
}
/* change snapused */
/* free next's deadlist */
/* set next's deadlist to our deadlist */
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
* and it. Those blocks will be born after the
* prev snap and before this snap, and will have
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
*
* XXX we're doing this long task with the
* config lock held
*/
FTAG, &ds_after_next));
VERIFY(0 ==
} else {
if (ds_prev) {
}
/*
* Reduce the amount of our unconsumed refreservation
* being charged to our parent by the amount of
* new unique data we have gained.
*/
}
}
} else {
/*
* There's no next snapshot, so this is a head dataset.
* Destroy the deadlist. Unless it's a clone, the
* deadlist should be empty. (If it's a clone, it's
* safe to ignore the deadlist contents.)
*/
/*
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
* NB: this should be very quick, because we already
* freed all the objects in open context.
*/
}
/* Erase the link in the dir */
} else {
/* remove from snapshot namespace */
#ifdef ZFS_DEBUG
{
}
#endif
}
}
}
static int
{
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* If there's an fs-only reservation, any blocks that might become
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
return (ENOSPC);
/*
* Propagate any reserved space for this snapshot to other
* snapshot checks in this sync group.
*/
if (asize > 0)
return (0);
}
/* ARGSUSED */
int
{
int err;
/*
* We don't allow multiple snapshots of the same txg. If there
* is already one, try again.
*/
return (EAGAIN);
/*
* Check for a conflicting snapshot name.
*/
if (err == 0)
return (EEXIST);
return (err);
/*
* Check that the dataset's name is not too long. Name consists
* of the dataset's length + 1 for the @-sign + snapshot name's length
*/
return (ENAMETOOLONG);
if (err)
return (err);
return (0);
}
void
{
int err;
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
crtxg = 1;
else
} else if (next_clones_obj != 0) {
}
}
/*
* If we have a reference-reservation on this dataset, we will
* need to increase the amount of refreservation being charged
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
}
"dataset = %llu", dsobj);
}
void
{
/*
* in case we had to change ds_fsid_guid when we opened it,
* sync it out now.
*/
}
void
{
ds->ds_reserved);
/*
* This is a snapshot; override the dd's space used with
* our unique space and compression ratio.
*/
}
}
void
{
} else {
stat->dds_num_clones = 0;
}
/* clone origin is really a dsl_dir thing... */
} else {
}
}
{
return (ds->ds_fsid_guid);
}
void
{
/*
* Adjust available bytes according to refquota
*/
else
*availbytesp = 0;
}
}
{
return (B_FALSE);
return (B_TRUE);
return (B_FALSE);
}
/* ARGSUSED */
static int
{
char *newsnapname = arg2;
int err;
if (err)
return (err);
/* new name better not be in use */
if (err == 0)
err = 0;
/* dataset name + 1 for the "@" + the new snapshot name must fit */
err = ENAMETOOLONG;
return (err);
}
static void
{
const char *newsnapname = arg2;
int err;
}
/*
* Argument bundle for the recursive snapshot-rename code in this file.
* NOTE(review): field roles are inferred from their names only --
* confirm against the code that fills in and reads this struct.
*/
struct renamesnaparg {
char failed[MAXPATHLEN];	/* fixed-size name buffer; presumably the dataset that failed */
char *oldsnap;	/* presumably the existing snapshot name */
char *newsnap;	/* presumably the replacement snapshot name */
};
static int
{
char *cp;
int err;
*cp = '@';
/*
* For recursive snapshot renames the parent won't be changing
*/
return (0);
} else if (err) {
return (err);
}
#ifdef _KERNEL
/*
* For all filesystems undergoing rename, we'll need to unmount it.
*/
#endif
*cp = '\0';
return (0);
} else if (err) {
return (err);
}
return (0);
}
static int
{
int err;
struct renamesnaparg *ra;
/* truncate the snapshot name to get the fsname */
*cp = '\0';
if (err) {
return (err);
}
if (err == 0) {
}
}
}
if (err)
return (err);
}
static int
{
return (ENAMETOOLONG);
return (0);
}
int
{
const char *tail;
int err;
if (err)
return (err);
/*
* If there are more than 2 references there may be holds
* hanging around that haven't been cleared out yet.
*/
/* if we're growing, validate child name lengths */
if (delta > 0)
if (!err)
return (err);
}
if (tail[0] != '@') {
/* the name ended in a nonexistent component */
return (ENOENT);
}
/* new name must be snapshot in same filesystem */
return (EINVAL);
tail++;
return (EXDEV);
if (recursive) {
} else {
if (err)
return (err);
}
return (err);
}
struct promotenode {
};
struct promotearg {
};
/* ARGSUSED */
static int
{
int err;
/* Check that it is a real clone */
return (EINVAL);
/* Since this is so expensive, don't do the preliminary check */
if (!dmu_tx_is_syncing(tx))
return (0);
return (EXDEV);
/* compute origin's new unique space */
if (err)
return (err);
/*
* Walk the snapshots that we are moving
*
* Compute space to transfer. Consider the incremental changes
* to used for each snapshot:
* (my used) = (prev's used) + (blocks born) - (blocks killed)
* So each snapshot gave birth to:
* (blocks born) = (my used) - (prev's used) + (blocks killed)
* So a sequence would look like:
* (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
* Which simplifies to:
* uN + kN + kN-1 + ... + k1 + k0
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
/* Check that the snapshot name does not conflict */
if (err == 0)
return (EEXIST);
return (err);
/* The very first snapshot does not have a deadlist */
continue;
return (err);
}
/*
* If we are a clone of a clone then we never reached ORIGIN,
* so we need to subtract out the clone origin's used space.
*/
if (pa->origin_origin) {
}
/* Check that there is enough space here */
if (err)
return (err);
/*
* Compute the amounts of space that will be used by snapshots
* after the promotion (for both origin and clone). For each,
* it is the amount of space that will be on all of their
* deadlists (that was not born before their new origin).
*/
/*
* Note, typically this will not be a clone of a clone,
* so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
* these snaplist_space() -> bplist_space_birthrange()
* calls will be fast because they do not have to
* iterate over all bps.
*/
if (err)
return (err);
if (err)
return (err);
}
if (err)
return (err);
}
return (0);
}
static void
{
/*
* We need to explicitly open odd, since origin_ds's dd will be
* changing.
*/
/* change origin's next snap */
/* change the origin's next clone */
oldnext_obj, tx));
}
/* change origin */
/* move snapshots to this dir */
/* unregister props as dsl_dir is changing */
if (ds->ds_user_ptr) {
}
/* move snap name entry */
/* change containing dsl_dir */
}
/*
* Change space accounting.
* Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
* both be valid, or both be 0 (resulting in delta == 0). This
* is true for each of {clone,origin} independently.
*/
/* log history record */
}
/*
* Passed as an argument to the call made in the "own" branch of the
* snapshot-list builder below.
* NOTE(review): presumably this is the hold/owner tag for those
* snapshots, and only its pointer identity matters -- confirm.
*/
static char *snaplist_tag = "snaplist";
/*
* Make a list of dsl_dataset_t's for the snapshots between first_obj
* (exclusive) and last_obj (inclusive). The list will be in reverse
* order (last_obj will be the list_head()). If first_obj == 0, do all
* snapshots back to this dataset's origin.
*/
static int
{
list_create(l, sizeof (struct promotenode),
struct promotenode *snap;
int err;
if (own) {
0, snaplist_tag, &ds);
if (err == 0)
} else {
}
/* lost race with snapshot destroy */
continue;
} else if (err) {
return (err);
}
if (first_obj == 0)
list_insert_tail(l, snap);
}
return (0);
}
static int
{
struct promotenode *snap;
*spacep = 0;
if (err)
return (err);
}
return (0);
}
static void
{
struct promotenode *snap;
if (!list_link_active(&l->list_head))
return;
list_remove(l, snap);
if (own)
else
}
list_destroy(l);
}
/*
* Promote a clone. Nomenclature note:
* "clone" or "cds": the original clone which is being promoted
* "origin" or "ods": the snapshot which is originally the clone's origin
* "origin head" or "ohds": the dataset which is the head
* (filesystem/volume) for the origin
* "origin origin": the origin of the origin's filesystem (typically
* NULL, indicating that the clone is not a clone of a clone).
*/
int
dsl_dataset_promote(const char *name)
{
dsl_pool_t *dp;
struct promotearg pa = { 0 };
struct promotenode *snap;
int err;
if (err)
return (err);
if (err) {
return (err);
}
return (EINVAL);
}
/*
* We are going to inherit all the snapshots taken before our
* origin (i.e., our new origin will be our parent's origin).
* Take ownership of them so that we can rename them into our
* namespace.
*/
&pa.shared_snaps);
if (err != 0)
goto out;
if (err != 0)
goto out;
if (err != 0)
goto out;
if (err != 0)
goto out;
}
out:
/*
* Add in 128x the snapnames zapobj size, since we will be moving
* a bunch of snapnames to the promoted ds, and dirtying their
* bonus buffers.
*/
if (err == 0) {
}
if (pa.origin_origin)
return (err);
}
struct cloneswaparg {
};
/* ARGSUSED */
static int
{
/* they should both be heads */
return (EINVAL);
/* the branch point should be just before them */
return (EINVAL);
/* cds should be the clone */
return (EINVAL);
/* the clone should be a child of the origin */
return (EINVAL);
/* ohds shouldn't be modified unless 'force' */
return (ETXTBSY);
/* adjust amount of any unconsumed refreservation */
if (csa->unused_refres_delta > 0 &&
return (ENOSPC);
return (0);
}
/* ARGSUSED */
static void
{
}
}
/* reset origin's unique bytes */
/* swap blkptrs */
{
}
/* set dd_*_bytes */
{
dd_used_breakdown[DD_USED_SNAP], ==, 0);
&cdl_comp, &cdl_uncomp));
&odl_comp, &odl_uncomp));
/*
* The difference in the space used by snapshots is the
* difference in snapshot space due to the head's
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
}
#define SWITCH64(x, y) \
{ \
(x) = (y); \
(y) = __tmp; \
}
/* swap ds_*_bytes */
/* apply any parent delta for change in unconsumed refreservation */
/* swap deadlists */
}
/*
* Swap 'clone' with its origin head file system. Used at the end
* of "online recv" to swizzle the file system to the new version.
*/
int
{
struct cloneswaparg csa;
int error;
/* Need exclusive access for the swap */
goto retry;
}
}
return (error);
}
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
*/
int
{
dsl_pool_t *dp;
int error;
return (error);
}
return (error);
}
int
{
int error = 0;
/*
* *ref_rsrv is the portion of asize that will come from any
* unconsumed refreservation space.
*/
*ref_rsrv = 0;
/*
* Make a space adjustment for reserved bytes.
*/
*ref_rsrv =
}
return (0);
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk usage is over quota and there are no pending changes (which
* may free up space for us).
*/
else
}
return (error);
}
/* ARGSUSED */
static int
{
return (ENOTSUP);
if (new_quota == 0)
return (0);
return (ENOSPC);
return (0);
}
/* ARGSUSED */
void
{
}
int
{
int err;
if (err)
return (err);
/*
* If someone removes a file, then tries to set the quota, we
* want to make sure the file freeing takes effect.
*/
}
return (err);
}
static int
{
if (new_reservation > INT64_MAX)
return (EOVERFLOW);
return (ENOTSUP);
if (dsl_dataset_is_snapshot(ds))
return (EINVAL);
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx))
return (0);
if (delta > 0 &&
return (ENOSPC);
return (ENOSPC);
return (0);
}
/* ARGSUSED */
static void
{
}
int
{
int err;
if (err)
return (err);
return (err);
}