dmu.c revision d5285cae913f4e01ffa0e6693a6d8ef1fbea30ba
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#include <sys/dmu_impl.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#ifdef _KERNEL
#include <sys/zfs_znode.h>
#endif
/*
* Global integer flag for the nopwrite feature; it is initialized to 1.
* NOTE(review): presumably a nonzero value means nopwrite may be used --
* confirm against the code that reads this variable.
*/
int zfs_nopwrite_enabled = 1;
};
{ byteswap_uint8_array, "uint8" },
{ byteswap_uint16_array, "uint16" },
{ byteswap_uint32_array, "uint32" },
{ byteswap_uint64_array, "uint64" },
{ zap_byteswap, "zap" },
{ dnode_buf_byteswap, "dnode" },
{ dmu_objset_byteswap, "objset" },
{ zfs_znode_byteswap, "znode" },
{ zfs_oldacl_byteswap, "oldacl" },
{ zfs_acl_byteswap, "acl" }
};
int
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
if (err)
return (err);
} else {
if (err) {
}
}
return (err);
}
/*
 * Report the upper bound on bonus buffer length, i.e. DN_MAX_BONUSLEN.
 */
int
dmu_bonus_max(void)
{
	int max_bonuslen = DN_MAX_BONUSLEN;

	return (max_bonuslen);
}
int
{
int error;
} else {
error = 0;
}
return (error);
}
int
{
int error;
if (!DMU_OT_IS_VALID(type)) {
} else {
error = 0;
}
return (error);
}
{
return (type);
}
int
{
int error;
return (error);
}
/*
* returns ENOENT, EIO, or 0.
*/
int
{
int error;
if (error)
return (error);
}
/* as long as the bonus buf is held, the dnode will be held */
}
/*
* Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
* hold and incrementing the dbuf count to ensure that dnode_move() sees
* a dnode hold for every dbuf.
*/
return (0);
}
/*
* returns ENOENT, EIO, or 0.
*
* This interface will allocate a blank spill dbuf when a spill blk
* doesn't already exist on the dnode.
*
* if you only want to find an already existing spill db, then
* dmu_spill_hold_existing() should be used.
*/
int
{
int err;
if ((flags & DB_RF_HAVESTRUCT) == 0)
if ((flags & DB_RF_HAVESTRUCT) == 0)
if (err == 0)
else
return (err);
}
int
{
int err;
} else {
if (!dn->dn_have_spill) {
} else {
}
}
return (err);
}
int
{
int err;
return (err);
}
/*
* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
* to take a held dnode rather than <os, object> -- the lookup is wasteful,
* and can induce severe lock contention when writing to several files
* whose dnodes are in the same block.
*/
static int
{
int err;
if (dn->dn_datablkshift) {
} else {
zfs_panic_recover("zfs: accessing past end of object "
"%llx/%llx (size=%u access=%llu+%llu)",
return (EIO);
}
nblks = 1;
}
for (i = 0; i < nblks; i++) {
return (EIO);
}
/* initiate async i/o */
if (read) {
}
}
/* wait for async i/o */
/* track read overhead when we are in sync context */
if (err) {
return (err);
}
/* wait for other io to complete */
if (read) {
for (i = 0; i < nblks; i++) {
if (err) {
return (err);
}
}
}
return (0);
}
static int
{
int err;
if (err)
return (err);
return (err);
}
int
{
int err;
return (err);
}
void
{
int i;
if (numbufs == 0)
return;
for (i = 0; i < numbufs; i++) {
if (dbp[i])
}
}
void
{
if (zfs_prefetch_disable)
return;
if (len == 0) { /* they're interested in the bonus buffer */
return;
return;
}
/*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
if (err != 0)
return;
if (dn->dn_datablkshift) {
} else {
}
if (nblks != 0) {
for (i = 0; i < nblks; i++)
}
}
/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
*/
static int
{
return (0);
}
int err;
/* find next allocated L1 indirect */
/* if there are no more, then we are done */
return (0);
} else if (err) {
return (err);
}
blkcnt += 1;
/* reset offset to end of "next" block back */
else
*start -= 1;
}
return (0);
}
static int
{
end = object_size;
return (0);
while (length) {
/* assert(offset <= start) */
if (err)
return (err);
if (err) {
return (err);
}
if (start == 0 && free_dnode) {
}
}
return (0);
}
int
{
int err;
if (err != 0)
return (err);
return (err);
}
int
{
int err;
if (err != 0)
return (err);
if (err == 0) {
} else {
}
} else {
}
return (err);
}
int
{
if (err)
return (err);
return (0);
}
int
{
if (err)
return (err);
/*
* Deal with odd block sizes, where there can't be data past the first
* block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
if (dn->dn_maxblkid == 0) {
}
while (size > 0) {
int i;
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
if (err)
break;
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
}
}
return (err);
}
void
{
int numbufs, i;
if (size == 0)
return;
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
else
}
}
void
{
int numbufs, i;
if (size == 0)
return;
for (i = 0; i < numbufs; i++) {
}
}
/*
* DMU support for xuio
*/
int
{
else
return (0);
}
void
{
else
}
/*
* Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
* and increase priv->next by 1.
*/
int
{
return (0);
}
int
{
}
{
}
void
{
}
static void
xuio_stat_init(void)
{
}
}
static void
xuio_stat_fini(void)
{
}
}
void
{
}
void
{
}
#ifdef _KERNEL
int
{
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
if (xuio) {
if (!err) {
}
else
} else {
}
if (err)
break;
}
return (err);
}
static int
{
int numbufs;
int err = 0;
int i;
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
else
/*
* XXX uiomove could block forever (eg. nfs-backed
* pages). There needs to be a uiolockdown() function
* to lock the pages in memory, so that uiomove won't
* block.
*/
if (err)
break;
}
return (err);
}
int
{
int err;
if (size == 0)
return (0);
return (err);
}
int
{
int err;
if (size == 0)
return (0);
if (err)
return (err);
return (err);
}
int
{
int numbufs, i;
int err;
if (size == 0)
return (0);
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int bufoff;
else
}
}
return (err);
}
#endif
/*
* Allocate a loaned anonymous arc buffer.
*/
{
}
/*
* Free a loaned arc buffer.
*/
void
{
}
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
* dmu_write().
*/
void
{
} else {
}
}
typedef struct {
/* ARGSUSED */
static void
{
if (BP_IS_HOLE(bp)) {
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
*/
} else {
}
}
}
static void
{
}
/* ARGSUSED */
static void
{
}
} else {
}
}
static void
{
/*
* If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
* then there is nothing to do here. Otherwise, free the
* newly allocated block in this txg.
*/
} else {
}
}
}
static int
{
return (EIO); /* Make zl_get_data do txg_waited_synced() */
}
return (0);
}
/*
* Intent log support: sync the block associated with db to disk.
* N.B. and XXX: the caller is responsible for making sure that the
* data isn't changing while dmu_sync() is writing it.
*
* Return values:
*
* EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
* The caller should not log the write.
*
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
* EIO: could not do the I/O.
* The caller should do a txg_wait_synced().
*
* 0: the I/O has been initiated.
* The caller should log this blkptr in the done callback.
* It is possible that the I/O will fail, in which case
* the error will be reported to the done callback and
* propagated to pio from zio_done().
*/
int
{
/*
* If we're frozen (running ziltest), we always need to generate a bp.
*/
/*
* Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
* and us. If we determine that this txg is not yet syncing,
* but it begins to sync a moment later, that's OK because the
* sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
*/
/*
* This txg has already synced. There's nothing to do.
*/
return (EEXIST);
}
/*
* This txg is currently syncing, so we can't mess with
* the dirty record anymore; just write a new log block.
*/
}
/*
* There's no dr for this dbuf, so it must have been freed.
* There's no need to log writes to freed blocks, so we're done.
*/
return (ENOENT);
}
/*
* Assume the on-disk data is X, the current syncing data is Y,
* and the current in-memory data is Z (currently in dmu_sync).
* X and Z are identical but Y has been modified. Normally,
* when X and Z are the same we will perform a nopwrite but if Y
* is different we must disable nopwrite since the resulting write
* of Y to disk can free the block containing X. If we allowed a
* nopwrite to occur the block pointing to Z would reference a freed
* block. Since this is a rare case we simplify this by disabling
* nopwrite if the current dmu_sync-ing dbuf has been modified in
* a previous transaction.
*/
/*
* We have already issued a sync write for this buffer,
* or this buffer has already been synced. It could not
* have been dirtied since, or we would have cleared the state.
*/
return (EALREADY);
}
return (0);
}
int
{
int err;
if (err)
return (err);
return (err);
}
void
{
/* XXX assumes dnode_hold will not get an i/o error */
}
void
{
/* XXX assumes dnode_hold will not get an i/o error */
}
/*
* Global integer flag, initialized to 0.
* NOTE(review): the name suggests that a nonzero value disables metadata
* compression -- confirm against the code that reads this variable.
*/
int zfs_mdcomp_disable = 0;
void
{
/*
* We maintain different write policies for each of the following
* types of data:
* 1. metadata
* 2. preallocated blocks (i.e. level-0 blocks of a dump device)
* 3. all other level 0 blocks
*/
if (ismd) {
/*
* XXX -- we should design a compression algorithm
* that specializes in arrays of bps.
*/
/*
* Metadata always gets checksummed. If the data
* checksum is multi-bit correctable, and it's not a
* ZBT-style checksum, then it's suitable for metadata
* as well. Otherwise, the metadata checksum defaults
* to fletcher4.
*/
/*
* If we're writing preallocated blocks, we aren't actually
* writing them so don't set any policy properties. These
* blocks are currently only used by an external subsystem
* outside of zfs (i.e. dump) and not written by the zio
* pipeline.
*/
} else {
/*
* Determine dedup setting. If we are in dmu_sync(),
* we won't actually dedup now because that's all
* done in syncing context; but we do want to use the
* dedup checksum. If the checksum is not strong
* enough to ensure unique signatures, force
* dedup_verify.
*/
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
}
/*
* Enable nopwrite if we have a cryptographically secure
* checksum that has no known collisions (i.e. SHA-256)
* and compression is enabled. We don't enable nopwrite if
* dedup is enabled as the two features are mutually exclusive.
*/
}
}
int
{
int i, err;
if (err)
return (err);
/*
* Sync any current changes before
* we go trundling through the block pointers.
*/
for (i = 0; i < TXG_SIZE; i++) {
break;
}
if (i != TXG_SIZE) {
if (err)
return (err);
}
return (err);
}
void
{
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
}
/*
* Get information on a DMU object.
* If doi is NULL, just indicates whether the object exists.
*/
int
{
if (err)
return (err);
return (0);
}
/*
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
{
}
/*
* Faster still when you only care about the size.
* This is specifically optimized for zfs_getattr().
*/
void
{
/* add 1 for dnode space */
SPA_MINBLOCKSHIFT) + 1;
}
void
{
int i;
for (i = 0; i < count; i++)
}
void
{
int i;
for (i = 0; i < count; i++)
}
void
{
int i;
for (i = 0; i < count; i++)
}
/* ARGSUSED */
void
{
}
/*
* Run the initialization routines visible here, in this order:
* dnode_init(), dbuf_init(), zfetch_init(), l2arc_init(), arc_init().
* dmu_fini() invokes the corresponding *_fini() routines in the
* opposite order, so keep the two lists in sync when editing either.
*/
void
dmu_init(void)
{
dnode_init();
dbuf_init();
zfetch_init();
l2arc_init();
arc_init();
}
/*
* Run the teardown routines in the reverse of the order that dmu_init()
* uses for the matching *_init() calls: arc_fini(), l2arc_fini(),
* zfetch_fini(), dbuf_fini(), and finally dnode_fini().
*/
void
dmu_fini(void)
{
arc_fini();
l2arc_fini();
zfetch_fini();
dbuf_fini();
dnode_fini();
}