dmu_send.c revision 643da460c8ca583e39ce053081754e24087f84c8
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
*/
#include <sys/dmu_impl.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
/*
* NOTE(review): no user of dmu_recv_tag is visible in this excerpt; by
* name it is presumably the tag used for the dataset holds taken by the
* receive path (see the "leak the holds" note on dmu_recv_begin) -- confirm.
*/
static char *dmu_recv_tag = "dmu_recv_tag";
/*
* NOTE(review): presumably the name of the temporary clone that the
* receive-begin comments refer to ("recv into temp clone") -- confirm
* against the callers, which are not visible in this excerpt.
*/
static const char *recv_clone_name = "%recv";
static int
{
}
static int
{
/*
* When we receive a free record, dbuf_free_range() assumes
* that the receiving system doesn't have any dbufs in the range
* being freed. This is always true because there is a one-record
* constraint: we only send one WRITE record for any given
* object+offset. We know that the one-record constraint is
* true because we always send data in increasing order by
* object,offset.
*
* If the increasing-order constraint ever changes, we should find
* another way to assert that the one-record constraint is still
* satisfied.
*/
/*
* If we are doing a non-incremental send, then there can't
* be any data in the dataset we're receiving into. Therefore
* a free record would simply be a no-op. Save space by not
* sending it to begin with.
*/
if (!dsp->dsa_incremental)
return (0);
length = -1ULL;
/*
* If there is a pending op, but it's not PENDING_FREE, push it out,
* since free block aggregation can only be done for blocks of the
* same type (i.e., DRR_FREE records can only be aggregated with
* other DRR_FREE records, and DRR_FREEOBJECTS records can only be
* aggregated with other DRR_FREEOBJECTS records).
*/
sizeof (dmu_replay_record_t)) != 0)
}
/*
* There should never be a PENDING_FREE if length is -1
* (because dump_dnode is the only place where this
* function is called with a -1, and only after flushing
* any pending record).
*/
/*
* Check to see whether this free block can be aggregated
* with pending one.
*/
return (0);
} else {
/* not a continuation. Push out pending record */
sizeof (dmu_replay_record_t)) != 0)
}
}
/* create a FREE record and make it pending */
if (length == -1ULL) {
sizeof (dmu_replay_record_t)) != 0)
} else {
}
return (0);
}
static int
{
/*
* We send data in increasing object, offset order.
* See comment in dump_free() for details.
*/
/*
* If there is any kind of pending aggregation (currently either
* a grouping of free objects or free blocks), push it out to
* the stream, since aggregation can't be done across operations
* of different types.
*/
sizeof (dmu_replay_record_t)) != 0)
}
/* write a DATA record */
/*
* There's no pre-computed checksum for partial-block
* writes or embedded BP's, so (like
* fletcher4-checksummed blocks) userland will have to
* compute a dedup-capable checksum itself.
*/
} else {
}
return (0);
}
static int
{
char buf[BPE_PAYLOAD_SIZE];
struct drr_write_embedded *drrw =
sizeof (dmu_replay_record_t)) != 0)
return (EINTR);
}
return (EINTR);
return (EINTR);
return (0);
}
static int
{
sizeof (dmu_replay_record_t)) != 0)
}
/* write a SPILL record */
return (0);
}
static int
{
/* See comment in dump_free(). */
if (!dsp->dsa_incremental)
return (0);
/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
* blocks of the same type (i.e., DRR_FREE records can only be
* aggregated with other DRR_FREE records, and DRR_FREEOBJECTS records
* can only be aggregated with other DRR_FREEOBJECTS records).
*/
sizeof (dmu_replay_record_t)) != 0)
}
/*
* See whether this free object array can be aggregated
* with pending one
*/
return (0);
} else {
/* can't be aggregated. Push out pending record */
sizeof (dmu_replay_record_t)) != 0)
}
}
/* write a FREEOBJECTS record */
return (0);
}
static int
{
sizeof (dmu_replay_record_t)) != 0)
}
/* write an OBJECT record */
/* Free anything past the end of the file. */
return (0);
}
static boolean_t
{
if (!BP_IS_EMBEDDED(bp))
return (B_FALSE);
/*
* Compression function must be legacy, or explicitly enabled.
*/
return (B_FALSE);
/*
* Embed type must be explicitly enabled.
*/
switch (BPE_GET_ETYPE(bp)) {
case BP_EMBEDDED_TYPE_DATA:
return (B_TRUE);
break;
default:
return (B_FALSE);
}
return (B_FALSE);
}
/* ARGSUSED */
static int
{
int err = 0;
return (0);
/*
* If we are sending a non-snapshot (which is allowed on
* read-only pools), it may have a ZIL, which must be ignored.
*/
return (0);
} else if (BP_IS_HOLE(bp) &&
} else if (BP_IS_HOLE(bp)) {
return (0);
} else if (type == DMU_OT_DNODE) {
int i;
for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
(DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
if (err != 0)
break;
}
/* it's an embedded level-0 block of a regular object */
} else { /* it's a level-0 block of a regular object */
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
ptr++)
*ptr = 0x2f5baddb10cULL;
} else {
}
}
if (!(dsp->dsa_featureflags &
offset += n;
buf += n;
blksz -= n;
}
} else {
}
}
return (err);
}
/*
* Releases dp using the specified tag.
*/
static int
{
int err;
uint64_t featureflags = 0;
if (err != 0) {
return (err);
}
#ifdef _KERNEL
}
if (version >= ZPL_VERSION_SA) {
}
}
#endif
if (embedok &&
} else {
}
if (is_clone)
}
if (!ds->ds_is_snapshot) {
}
goto out;
}
if (err != 0) {
goto out;
}
goto out;
}
out:
return (err);
}
int
{
dsl_pool_t *dp;
int err;
if (err != 0)
return (err);
if (err != 0) {
return (err);
}
if (fromsnap != 0) {
if (err != 0) {
return (err);
}
} else {
}
return (err);
}
int
{
dsl_pool_t *dp;
int err;
if (err != 0)
return (err);
/*
* We are sending a filesystem or volume. Ensure
* that it doesn't change by owning the dataset.
*/
} else {
}
if (err != 0) {
return (err);
}
/*
* If the fromsnap is in a different filesystem, then
* mark the send stream as a clone.
*/
}
if (err == 0) {
}
} else {
}
if (err != 0) {
return (err);
}
} else {
}
if (owned)
else
return (err);
}
static int
{
int err;
/*
* Assume that space (both on-disk and in-stream) is dominated by
* data. We will adjust for indirect blocks and the copies property,
* but ignore per-object space used (e.g., dnodes and DRR_OBJECT records).
*/
/*
* Subtract out approximate space used by indirect blocks.
* Assume most space is used by data blocks (non-indirect, non-dnode).
* Assume all blocks are recordsize. Assume ditto blocks and
* internal fragmentation counter out compression.
*
* Therefore, space used by indirect blocks is sizeof(blkptr_t) per
* block, which we observe in practice.
*/
if (err != 0)
return (err);
/* Add in the space for the record associated with each block. */
return (0);
}
int
{
int err;
/* tosnap must be a snapshot */
if (!ds->ds_is_snapshot)
/*
* fromsnap must be an earlier snapshot from the same fs as tosnap,
* or the origin's fs.
*/
/* Get uncompressed size estimate of changed data. */
} else {
if (err != 0)
return (err);
}
return (err);
}
/*
* Simple callback used to traverse the blocks of a snapshot and sum their
* uncompressed size
*/
/* ARGSUSED */
static int
{
}
return (0);
}
/*
* Given a destination snapshot and a TXG, calculate the approximate size of a
* send stream sent from that TXG. from_txg may be zero, indicating that the
* whole snapshot will be sent.
*/
int
{
int err;
/* tosnap must be a snapshot */
if (!dsl_dataset_is_snapshot(ds))
/* verify that from_txg is before the provided snapshot was taken */
}
/*
* traverse the blocks of the snapshot with birth times after
* from_txg, summing their uncompressed size
*/
if (err)
return (err);
return (err);
}
typedef struct dmu_recv_begin_arg {
const char *drba_origin;
static int
{
int error;
/* temporary clone name must not exist */
/* new snapshot name must not exist */
/*
* Check snapshot limit before receiving. We'll recheck again at the
* end, but might as well abort before receiving if we're already over
* the limit.
*
* Note that we do not check the file system limit with
* dsl_dir_fscount_check because the temporary %clones don't count
* against that limit.
*/
if (error != 0)
return (error);
if (fromguid != 0) {
/* Find snapshot in this dir that matches fromguid. */
while (obj != 0) {
&snap);
if (error != 0)
}
break;
}
if (obj == 0)
} else {
/*
* If we are not forcing, there must be no
* changes since fromsnap.
*/
}
}
} else {
/* if full, most recent snapshot must be $ORIGIN */
}
return (0);
}
static int
{
int error;
/* already checked */
/* Verify pool version supports SA if SA_SPILL feature set */
if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plain WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
* feature enabled if the stream has LARGE_BLOCKS.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
if (error == 0) {
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE) {
}
/* target fs does not exist; must be a full backup or clone */
char buf[MAXNAMELEN];
/*
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
*/
/* Open the parent of tofs */
if (error != 0)
return (error);
/*
* Check filesystem and snapshot limits before receiving. We'll
* recheck snapshot limits again at the end (we create the
* filesystems and increment those counts during begin_sync).
*/
if (error != 0) {
return (error);
}
if (error != 0) {
return (error);
}
if (error != 0) {
return (error);
}
if (!origin->ds_is_snapshot) {
}
}
}
error = 0;
}
return (error);
}
static void
{
int error;
DS_FLAG_CI_DATASET : 0;
if (error == 0) {
/* create temporary clone */
if (drba->drba_snapobj != 0) {
}
} else {
const char *tail;
}
/* Create new dataset. */
}
!newds->ds_large_blocks) {
}
/*
* If we actually created a non-clone, we need to create the
* objset in our new dataset.
*/
}
}
/*
* NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
{
dmu_recv_begin_arg_t drba = { 0 };
if (drc->drc_byteswap) {
} else {
}
if (drc->drc_byteswap) {
}
}
/*
* State block for the stream-receive code; the function that reads and
* processes stream records declares one as "ra" and zero-initializes it.
*
* NOTE(review): that function also references ra.guid_to_ds_map, which is
* not among the members shown here, so this definition appears to be
* truncated -- confirm against the complete source before relying on it.
*/
struct restorearg {
int err; /* NOTE(review): presumably the receive error status -- confirm */
char *buf; /* data buffer; its allocated size is tracked in bufsize */
int bufsize; /* amount of memory allocated for buf */
};
typedef struct guid_map_entry {
static int
{
return (-1);
return (1);
return (0);
}
static void
free_guid_map_onexit(void *arg)
{
}
}
static void *
{
int done = 0;
/* some things will require 8-byte alignment, so everything must be aligned */
return (NULL);
}
else
return (buf);
}
static void
{
case DRR_BEGIN:
break;
case DRR_OBJECT:
break;
case DRR_FREEOBJECTS:
break;
case DRR_WRITE:
break;
case DRR_WRITE_BYREF:
break;
case DRR_WRITE_EMBEDDED:
break;
case DRR_FREE:
break;
case DRR_SPILL:
break;
case DRR_END:
break;
}
}
static inline uint8_t
{
if (bonus_type == DMU_OT_SA) {
return (1);
} else {
return (1 +
}
}
static int
{
int err;
}
if (drro->drr_bonuslen) {
}
/*
* If we are losing blkptrs or changing the block size this must
* be a new file instance. We must clear out the previous file
* contents before we can change this type of metadata in the dnode.
*/
if (err == 0) {
int nblkptr;
drro->drr_bonuslen);
0, DMU_OBJECT_END);
if (err != 0)
}
}
if (err != 0) {
return (err);
}
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
/* currently allocated, but with different properties */
}
if (err != 0) {
}
tx);
drro->drr_bonuslen);
}
}
return (0);
}
/* ARGSUSED */
static int
struct drr_freeobjects *drrfo)
{
int err;
continue;
if (err != 0)
return (err);
}
return (0);
}
static int
{
void *data;
int err;
}
if (err != 0) {
return (err);
}
}
return (0);
}
/*
* Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
* streams to refer to a copy of the data that is already on the
* system because it came in earlier in the stream. This function
* finds the earlier copy of the data, and uses that copy instead of
* data from the stream to fulfill this write.
*/
static int
struct drr_write_byref *drrwbr)
{
int err;
/*
* If the GUID of the referenced dataset is different from the
* GUID of the target dataset, find the referenced dataset.
*/
}
} else {
}
if (err != 0)
return (err);
if (err != 0) {
return (err);
}
return (0);
}
static int
struct drr_write_embedded *drrwnp)
{
int err;
void *data;
return (EINVAL);
return (EINVAL);
return (EINVAL);
return (EINVAL);
if (err != 0) {
return (err);
}
return (0);
}
static int
{
void *data;
int err;
return (err);
}
if (err != 0) {
return (err);
}
return (0);
}
/* ARGSUSED */
static int
{
int err;
return (err);
}
/* used to destroy the drc_ds on error */
static void
{
char name[MAXNAMELEN];
(void) dsl_destroy_head(name);
}
/*
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
{
struct restorearg ra = { 0 };
int featureflags;
/* these were verified in dmu_recv_begin */
/*
* Open the objset we are modifying.
*/
/* if this stream is dedup'ed, set up the avl tree for guid mapping */
if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
if (cleanup_fd == -1) {
goto out;
}
cleanup_fd = -1;
goto out;
}
if (*action_handlep == 0) {
sizeof (guid_map_entry_t),
goto out;
} else {
(void **)&ra.guid_to_ds_map);
goto out;
}
}
/*
* Read records and process them.
*/
goto out;
}
case DRR_OBJECT:
{
/*
* We need to make a copy of the record header,
* because restore_{object,write} may need to
* restore_read(), which will invalidate drr.
*/
break;
}
case DRR_FREEOBJECTS:
{
struct drr_freeobjects drrfo =
break;
}
case DRR_WRITE:
{
break;
}
case DRR_WRITE_BYREF:
{
struct drr_write_byref drrwbr =
break;
}
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded drrwe =
break;
}
case DRR_FREE:
{
break;
}
case DRR_END:
{
/*
* We compare against the *previous* checksum
* value, because the stored checksum is of
* everything before the DRR_END record.
*/
goto out;
}
case DRR_SPILL:
{
break;
}
default:
goto out;
}
}
out:
/*
* destroy what we created, so we don't leave it in the
* inconsistent restoring state.
*/
}
}
static int
{
int error;
if (error != 0)
return (error);
/*
* We will destroy any snapshots in tofs (i.e. before
* origin_head) that are after the origin (which is
* the snap before drc_ds, because drc_ds can not
* have any snaps of its own).
*/
while (obj !=
&snap);
if (error != 0)
return (error);
if (error == 0) {
}
if (error != 0)
return (error);
}
}
if (error != 0) {
return (error);
}
if (error != 0)
return (error);
} else {
}
return (error);
}
static void
{
&origin_head));
/*
* Destroy any snapshots of drc_tofs (origin_head)
* after the origin (the snap before drc_ds).
*/
while (obj !=
&snap));
}
}
origin_head, tx);
/* set snapshot's creation time and guid */
} else {
/* set snapshot's creation time and guid */
}
/*
* Release the hold from dmu_recv_begin. This must be done before
* we return to open context, so that when we free the dataset's dnode,
* we can evict its bonus buffer.
*/
}
static int
{
dsl_pool_t *dp;
int err;
if (err != 0)
return (err);
if (err == 0) {
} else {
}
return (err);
}
/*
* NOTE(review): the consumer of this value is not visible in this excerpt;
* by name it is presumably an estimate of the number of blocks modified by
* the receive-end sync tasks -- confirm before changing it.
*/
static int dmu_recv_end_modified_blocks = 3;
static int
{
int error;
char name[MAXNAMELEN];
#ifdef _KERNEL
/*
* We will be destroying the ds; make sure its origin is unmounted if
* necessary.
*/
#endif
if (error != 0)
return (error);
}
static int
{
int error;
if (error != 0) {
}
return (error);
}
int
{
return (dmu_recv_new_end(drc));
else
return (dmu_recv_existing_end(drc));
}
/*
* Return TRUE if this objset is currently being received into.
*/
{
}