dsl_dataset.c revision b62969f868a827f0823a084bc0af9c7d8b76c659
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/zfeature.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
/*
 * Exchange the values of two 64-bit lvalues.
 *
 * x and y must both be modifiable lvalues; each is evaluated twice, so
 * neither argument may have side effects.  The temporary is declared
 * inside the macro's own block so that the expansion does not rely on
 * the caller having a suitable variable in scope.
 */
#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}
/*
* Figure out how much of this delta should be propagated to the dsl_dir
* layer. If there's a refreservation, that space has already been
* partially accounted for in our ancestors.
*/
static int64_t
{
if (ds->ds_reserved == 0)
return (delta);
}
void
{
/* It could have been compressed away to nothing */
if (BP_IS_HOLE(bp))
return;
return;
}
}
int
{
if (BP_IS_HOLE(bp))
return (0);
return (used);
}
} else {
if (async) {
/*
* We are here as part of zio's write done callback,
* which means we're a zio interrupt thread. We can't
* call dsl_deadlist_insert() now because it may block
* waiting for I/O. Instead, put bp on the deferred
* queue and let dsl_pool_sync() finish the job.
*/
} else {
}
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
}
}
}
return (used);
}
{
return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
* overestimation of the amount of space that an operation would
* consume, which is OK.
*
* There's also a small window where we could miss a pending
* snapshot, because we could set the sync task in the quiescing
* phase. So this should only be used as a guess.
*/
if (ds->ds_trysnap_txg >
}
{
return (B_FALSE);
return (B_TRUE);
}
/* ARGSUSED */
static void
{
}
}
int
{
int err;
if (ds->ds_snapname[0])
return (0);
return (0);
if (err != 0)
return (err);
return (err);
}
int
{
int err;
else
return (err);
}
int
{
int err;
else
return (err);
}
int
dsl_dataset_t **dsp)
{
int err;
if (err != 0)
return (err);
/* Make sure dsobj has the correct object type. */
}
if (err == 0) {
}
if (err != 0) {
return (err);
}
if (!dsl_dataset_is_snapshot(ds)) {
}
} else {
if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
&ds->ds_userrefs);
}
}
&ds->ds_reserved);
if (err == 0) {
}
} else {
}
if (err != 0) {
return (err);
}
} else {
ds->ds_fsid_guid =
}
}
return (0);
}
int
{
const char *snapname;
int err = 0;
if (err != 0)
return (err);
if (obj != 0)
else
/* we may be looking for a snapshot */
if (*snapname++ != '@') {
}
if (err == 0)
if (err == 0) {
if (ds->ds_snapname[0] == 0)
sizeof (ds->ds_snapname));
}
}
return (err);
}
int
{
if (err != 0)
return (err);
}
return (0);
}
int
{
if (err != 0)
return (err);
}
return (0);
}
/*
* See the comment above dsl_pool_hold() for details. In summary, a long
* hold is used to prevent destruction of a dataset while the pool hold
* is dropped, allowing other concurrent operations (e.g. spa_sync()).
*
* The dataset and pool must be held when this function is called. After it
* is called, the pool hold may be released while the dataset is still held
* and accessed.
*/
void
{
}
void
{
}
/* Return B_TRUE if there are any long holds on this dataset. */
{
}
void
{
} else {
if (ds->ds_snapname[0]) {
/*
* We use a "recursive" mutex so that we
* can call dprintf_ds() with ds_lock held.
*/
} else {
}
}
}
}
void
{
}
void
{
else
}
{
}
return (gotit);
}
{
DMU_OT_NONE, 0, tx);
} else {
}
}
}
}
}
return (dsobj);
}
static void
{
}
{
/*
* If we are creating a clone, make sure we zero out any stale
* data from the origin snapshot's zil header.
*/
}
return (dsobj);
}
/*
* The unique space in the head dataset can be calculated by subtracting
* the space used in the most recent snapshot, that is still being used
* in this file system, from the space currently in use. To figure out
* the space in the most recent snapshot still in use, we need to take
* the total space used in the snapshot and subtract out the space that
* has been freed up since the snapshot was taken.
*/
void
{
else
mrs_used = 0;
}
void
{
int err;
/*
* The err should not be ENOENT, but a bug in a previous version
* of the code could cause upgrade_clones_cb() to not set
* ds_next_snap_obj when it should, leading to a missing entry.
* If we knew that the pool was created after
* SPA_VERSION_NEXT_CLONES, we could assert that it isn't
* ENOENT. However, at least we can check that we don't have
* too many entries in the next_clones_obj even after failing to
* remove this one.
*/
&count));
}
blkptr_t *
{
}
void
{
/* If it's the meta-objset, set dp_meta_rootbp */
} else {
}
}
spa_t *
{
}
void
{
dsl_pool_t *dp;
return;
panic("dirtying snapshot!");
/* up the hold count until we can be written out */
}
}
{
for (int t = 0; t < TXG_SIZE; t++) {
ds, t))
return (B_TRUE);
}
return (B_FALSE);
}
static int
{
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* If there's an fs-only reservation, any blocks that might become
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
/*
* Propagate any reserved space for this snapshot to other
* snapshot checks in this sync group.
*/
if (asize > 0)
return (0);
}
typedef struct dsl_dataset_snapshot_arg {
int
{
int error;
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* We don't allow multiple snapshots of the same txg. If there
* is already one, try again.
*/
/*
* Check for conflicting snapshot name.
*/
if (error == 0)
return (error);
/*
* We don't allow taking snapshots of inconsistent datasets, such as
* those into which we are currently receiving. However, if we are
* creating this snapshot as part of a receive, this check will be
* executed atomically with respect to the completion of the receive
* itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
* case we ignore this, knowing it will be fixed up for us shortly in
* dmu_recv_end_sync().
*/
if (error != 0)
return (error);
return (0);
}
static int
{
int rv = 0;
int error = 0;
char dsname[MAXNAMELEN];
if (error == 0) {
if (error == 0)
}
if (error == 0)
if (error == 0) {
}
if (error != 0) {
}
}
}
return (rv);
}
void
{
static zil_header_t zero_zil;
/*
* If we are on an old pool, the zil must not be active, in which
* case it will be zeroed. Usually zil_suspend() accomplishes this.
*/
sizeof (zero_zil)) == 0);
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
crtxg = 1;
else
} else if (next_clones_obj != 0) {
}
}
/*
* If we have a reference-reservation on this dataset, we will
* need to increase the amount of refreservation being charged
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
}
}
static void
{
char dsname[MAXNAMELEN];
}
}
}
/*
* The snapshots must all be in the same pool.
* All-or-nothing: if there are any failures, nothing will be modified.
*/
int
{
int error;
char *firstname;
return (0);
if (error != 0)
return (error);
if (needsuspend) {
suspended = fnvlist_alloc();
char fsname[MAXNAMELEN];
char *atp;
void *cookie;
break;
}
if (error != 0)
break;
}
}
if (error == 0) {
}
zil_resume((void *)(uintptr_t)
}
}
return (error);
}
typedef struct dsl_dataset_snapshot_tmp_arg {
const char *ddsta_fsname;
const char *ddsta_snapname;
const char *ddsta_htag;
static int
{
int error;
if (error != 0)
return (error);
if (error != 0) {
return (error);
}
}
if (error != 0) {
return (error);
}
return (0);
}
static void
{
}
int
{
int error;
void *cookie;
if (error != 0)
return (error);
if (needsuspend) {
if (error != 0)
return (error);
}
if (needsuspend)
return (error);
}
void
{
/*
* in case we had to change ds_fsid_guid when we opened it,
* sync it out now.
*/
}
static void
{
/*
* There may be missing entries in ds_next_clones_obj
* due to a bug in a previous version of the code.
* Only trust it if it has the right number of entries.
*/
&count));
}
goto fail;
zap_cursor_advance(&zc)) {
char buf[ZFS_MAXNAMELEN];
}
fail:
}
void
{
if (dsl_dataset_is_snapshot(ds)) {
} else {
}
ds->ds_reserved);
ds->ds_userrefs);
if (err == 0) {
if (err == 0) {
written);
}
}
}
}
void
{
if (dsl_dataset_is_snapshot(ds)) {
} else {
stat->dds_num_clones = 0;
}
}
}
{
return (ds->ds_fsid_guid);
}
void
{
/*
* Adjust available bytes according to refquota
*/
else
*availbytesp = 0;
}
}
{
return (B_FALSE);
/*
* It may be that only the ZIL differs, because it was
* reset in the head. Don't count that as being
* modified.
*/
return (B_TRUE);
return (B_TRUE);
}
return (B_FALSE);
}
typedef struct dsl_dataset_rename_snapshot_arg {
const char *ddrsa_fsname;
const char *ddrsa_oldsnapname;
const char *ddrsa_newsnapname;
/* ARGSUSED */
static int
{
int error;
if (error != 0) {
/* ignore nonexistent snapshots */
}
/* new name should not exist */
if (error == 0)
error = 0;
/* dataset name + 1 for the "@" + the new snapshot name must fit */
return (error);
}
static int
{
int error;
if (error != 0)
return (error);
if (ddrsa->ddrsa_recursive) {
} else {
}
return (error);
}
static int
{
int error;
/* ignore nonexistent snapshots */
return (0);
}
/* log before we change the name */
return (0);
}
static void
{
if (ddrsa->ddrsa_recursive) {
} else {
}
}
int
dsl_dataset_rename_snapshot(const char *fsname,
{
}
/*
* If we're doing an ownership handoff, we need to make sure that there is
* only one long hold on the dataset. We're not allowed to change anything here
* so we don't permanently release the long hold or regular hold here. We want
* to do this only when syncing to avoid the dataset unexpectedly going away
* when we release the long hold.
*/
static int
{
if (!dmu_tx_is_syncing(tx))
return (0);
}
if (held)
return (0);
}
typedef struct dsl_dataset_rollback_arg {
const char *ddra_fsname;
void *ddra_owner;
static int
{
int error;
if (error != 0)
return (error);
/* must not be a snapshot */
if (dsl_dataset_is_snapshot(ds)) {
}
/* must have a most recent snapshot */
}
if (error != 0) {
return (error);
}
/*
* Check if the snap we are rolling back to uses more than
* the refquota.
*/
}
/*
* When we do the clone swap, we will temporarily use more space
* due to the refreservation (the head will no longer have any
* unique space, so the entire amount of the refreservation will need
* to be free). We will immediately destroy the clone, freeing
* this space, but the freeing happens over many txg's.
*/
if (unused_refres_delta > 0 &&
}
return (0);
}
static void
{
char namebuf[ZFS_MAXNAMELEN];
}
/*
* Rolls back the given filesystem or volume to the most recent snapshot.
* The name of the most recent snapshot will be returned under key "target"
* in the result nvlist.
*
* If owner != NULL:
* - The existing dataset MUST be owned by the specified owner at entry
* - Upon return, dataset will still be held by the same owner, whether we
* succeed or not.
*
* This mode is required any time the existing filesystem is mounted. See
* notes above zfs_suspend_fs() for further details.
*/
int
{
}
struct promotenode {
};
typedef struct dsl_dataset_promote_arg {
const char *ddpa_clonename;
char *err_ds;
void *tag);
static int
{
struct promotenode *snap;
int err;
if (err != 0)
return (err);
}
/*
* Compute and check the amount of space to transfer. Since this is
* so expensive, don't do the preliminary check.
*/
if (!dmu_tx_is_syncing(tx)) {
return (0);
}
/* compute origin's new unique space */
/*
* Walk the snapshots that we are moving
*
* Compute space to transfer. Consider the incremental changes
* to used by each snapshot:
* (my used) = (prev's used) + (blocks born) - (blocks killed)
* So each snapshot gave birth to:
* (blocks born) = (my used) - (prev's used) + (blocks killed)
* So a sequence would look like:
* (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
* Which simplifies to:
* uN + kN + kN-1 + ... + k1 + k0
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
/*
* If there are long holds, we won't be able to evict
* the objset.
*/
if (dsl_dataset_long_held(ds)) {
goto out;
}
/* Check that the snapshot name does not conflict */
if (err == 0) {
goto out;
}
goto out;
/* The very first snapshot does not have a deadlist */
continue;
}
/*
* If we are a clone of a clone then we never reached ORIGIN,
* so we need to subtract out the clone origin's used space.
*/
if (ddpa->origin_origin) {
}
/* Check that there is enough space here */
if (err != 0)
goto out;
/*
* Compute the amounts of space that will be used by snapshots
* after the promotion (for both origin and clone). For each,
* it is the amount of space that will be on all of their
* deadlists (that was not born before their new origin).
*/
/*
* Note, typically this will not be a clone of a clone,
* so dd_origin_txg will be < TXG_INITIAL, so
* these snaplist_space() -> dsl_deadlist_space_range()
* calls will be fast because they do not have to
* iterate over all bps.
*/
if (err != 0)
goto out;
if (err != 0)
goto out;
}
if (err != 0)
goto out;
}
out:
return (err);
}
static void
{
struct promotenode *snap;
/*
* We need to explicitly open odd, since origin_ds's dd will be
* changing.
*/
/* change origin's next snap */
/* change the origin's next clone */
oldnext_obj, tx));
}
/* change origin */
/* change dd_clone entries */
}
}
/* move snapshots to this dir */
/*
* Property callbacks are registered to a particular
* dsl_dir. Since ours is changing, evict the objset
* so that they will be unregistered from the old dsl_dir.
*/
}
/* move snap name entry */
/* change containing dsl_dir */
/* move any clone references */
zap_cursor_advance(&zc)) {
uint64_t o;
/*
* We've already moved the
* origin's reference.
*/
continue;
}
}
}
}
/*
* Change space accounting.
* Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
* both be valid, or both be 0 (resulting in delta == 0). This
* is true for each of {clone,origin} independently.
*/
/* log history record */
}
/*
* Make a list of dsl_dataset_t's for the snapshots between first_obj
* (exclusive) and last_obj (inclusive). The list will be in reverse
* order (last_obj will be the list_head()). If first_obj == 0, do all
* snapshots back to this dataset's origin.
*/
static int
{
list_create(l, sizeof (struct promotenode),
struct promotenode *snap;
int err;
if (err != 0)
return (err);
if (first_obj == 0)
list_insert_tail(l, snap);
}
return (0);
}
static int
{
struct promotenode *snap;
*spacep = 0;
}
return (0);
}
static void
{
struct promotenode *snap;
return;
list_remove(l, snap);
}
list_destroy(l);
}
static int
{
int error;
struct promotenode *snap;
&ddpa->ddpa_clone);
if (error != 0)
return (error);
!dsl_dir_is_clone(dd)) {
}
if (error != 0)
goto out;
if (error != 0)
goto out;
if (error != 0)
goto out;
if (error != 0)
goto out;
}
out:
if (error != 0)
return (error);
}
static void
{
}
/*
* Promote a clone.
*
* If it fails due to a conflicting snapshot name, "conflsnap" will be filled
* in with the name. (It must be at least MAXNAMELEN bytes long.)
*/
int
{
dsl_dataset_promote_arg_t ddpa = { 0 };
int error;
/*
* We will modify space proportional to the number of
* snapshots. Compute numsnaps.
*/
if (error != 0)
return (error);
if (error != 0)
return (error);
}
int
{
/* they should both be heads */
if (dsl_dataset_is_snapshot(clone) ||
/* if we are not forcing, the branch point should be just before them */
/* clone should be the clone (unless they are unrelated) */
/* the clone should be a child of the origin */
/* origin_head shouldn't be modified unless 'force' */
if (!force &&
/* origin_head should have no long holds (e.g. is not mounted) */
/* check amount of any unconsumed refreservation */
if (unused_refres_delta > 0 &&
/* clone can't be over the head's refquota */
if (origin_head->ds_quota != 0 &&
return (0);
}
void
{
}
}
/*
* Reset origin's unique bytes, if it exists.
*/
}
/* swap blkptrs */
{
}
/* set dd_*_bytes */
{
dd_used_breakdown[DD_USED_SNAP], ==, 0);
/*
* The difference in the space used by snapshots is the
* difference in snapshot space due to the head's
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
}
/* swap ds_*_bytes */
/* apply any parent delta for change in unconsumed refreservation */
unused_refres_delta, 0, 0, tx);
/*
* Swap deadlists.
*/
}
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
*/
int
{
dsl_pool_t *dp;
int error;
if (error != 0)
return (error);
if (error == 0) {
}
return (error);
}
int
{
int error = 0;
/*
* *ref_rsrv is the portion of asize that will come from any
* unconsumed refreservation space.
*/
*ref_rsrv = 0;
/*
* Make a space adjustment for reserved bytes.
*/
*ref_rsrv =
}
return (0);
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
if (inflight > 0 ||
else
}
return (error);
}
typedef struct dsl_dataset_set_qr_arg {
const char *ddsqra_name;
/* ARGSUSED */
static int
{
int error;
if (error != 0)
return (error);
if (dsl_dataset_is_snapshot(ds)) {
}
if (error != 0) {
return (error);
}
if (newval == 0) {
return (0);
}
}
return (0);
}
static void
{
}
}
int
{
dsl_dataset_set_refquota_sync, &ddsqra, 0));
}
static int
{
int error;
if (error != 0)
return (error);
if (dsl_dataset_is_snapshot(ds)) {
}
if (error != 0) {
return (error);
}
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx)) {
return (0);
}
if (!DS_UNIQUE_IS_ACCURATE(ds))
if (delta >
}
}
return (0);
}
void
{
}
static void
{
}
int
{
}
/*
* Return (in *usedp) the amount of space written in new that is not
* present in oldsnap. New may be a snapshot or the head. Old must be
* a snapshot before new, in new's filesystem (or its origin). If not then
* fail and return EINVAL.
*
* The written space is calculated by considering two components: First, we
* ignore any freed space, and calculate the written as new's used space
* minus old's used space. Next, we add in the amount of space that was freed
* between the two snapshots, thus reducing new's used space relative to old's.
* Specifically, this is the space that was born before old->ds_creation_txg,
* and freed before new (i.e. on new's deadlist or a previous deadlist).
*
* space freed [---------------------]
* snapshots ---O-------O--------O-------O------
* oldsnap new
*/
int
{
int err = 0;
*usedp = 0;
*compp = 0;
*uncompp = 0;
} else {
if (err != 0)
break;
}
/*
* The blocks in the deadlist can not be born after
* ds_prev_snap_txg, so get the whole deadlist space,
* which is more efficient (especially for old-format
* deadlists). Unfortunately the deadlist code
* doesn't have enough information to make this
* optimization itself.
*/
} else {
}
/*
* If we get to the beginning of the chain of snapshots
* (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
* was not a snapshot of/before new.
*/
if (snapobj == 0) {
break;
}
}
return (err);
}
/*
* Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
* lastsnap, and all snapshots in between are deleted.
*
* blocks that would be freed [---------------------------]
* snapshots ---O-------O--------O-------O--------O
* firstsnap lastsnap
*
* This is the set of blocks that were born after the snap before firstsnap,
* (birth > firstsnap->prev_snap_txg) and died before the snap after the
* last snap (i.e., is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
* We calculate this by iterating over the relevant deadlists (from the snap
* after lastsnap, backward to the snap after firstsnap), summing up the
* space on the deadlist that was born after the snap before firstsnap.
*/
int
{
int err = 0;
/*
* Check that the snapshots are in the same dsl_dir, and firstsnap
* is before lastsnap.
*/
if (err != 0)
break;
}
return (err);
}
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
* 'earlier' is before 'later'. Or 'earlier' could be the origin of
* 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
* filesystem. Or 'earlier' could be the origin's origin.
*/
{
int error;
return (B_FALSE);
return (B_TRUE);
return (B_FALSE);
return (B_TRUE);
if (error != 0)
return (B_FALSE);
return (ret);
}