dnode.c revision a846f19d279fdfb0e0d63f78ccaf0205a88274d2
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
/*
 * File-private kmem cache handle. dnode_fini() resets it to NULL.
 * NOTE(review): presumably the cache that dnode_t objects are allocated
 * from (dnode_init() references sizeof (dnode_t)) -- confirm.
 */
static kmem_cache_t *dnode_cache;
/*
* Define DNODE_STATS to turn on statistic gathering. By default, it is only
* turned on when DEBUG is also defined.
*/
#ifdef DEBUG
#define DNODE_STATS
#endif /* DEBUG */
#ifdef DNODE_STATS
#else
#endif /* DNODE_STATS */
/*
 * File-private dnode_phys_t with no explicit initializer; as an object with
 * static storage duration it is zero-initialized by the language.
 */
static dnode_phys_t dnode_phys_zero;
/*
 * Externally visible tunable, initialized to SPA_MINBLOCKSHIFT.
 * NOTE(review): presumably a default data block shift (log2 of the block
 * size), judging by the initializer -- confirm against its users.
 */
int zfs_default_bs = SPA_MINBLOCKSHIFT;
static int
{
return (-1);
}
return (1);
}
return (-1);
}
return (1);
}
return (-1);
return (1);
}
return (-1);
}
return (1);
}
return (0);
}
/* ARGSUSED */
static int
{
int i;
/*
* Every dbuf has a reference, and dropping a tracked reference is
* O(number of references), so don't track dn_holds.
*/
for (i = 0; i < TXG_SIZE; i++) {
sizeof (dbuf_dirty_record_t),
}
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
dn->dn_dirtyctx = 0;
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
dn->dn_unlisted_l0_blkid = 0;
return (0);
}
/* ARGSUSED */
static void
{
int i;
for (i = 0; i < TXG_SIZE; i++) {
}
}
/*
 * Initialization routine for this file; takes no arguments and returns
 * nothing.
 * NOTE(review): only the `sizeof (dnode_t),` argument fragment of the body
 * is visible here. Presumably it belongs to the call that creates
 * dnode_cache -- confirm against the complete source.
 */
void
dnode_init(void)
{
sizeof (dnode_t),
}
/*
 * Resets the file-private dnode_cache pointer to NULL so that no stale
 * cache handle is left behind. Presumably the teardown counterpart of
 * dnode_init().
 * NOTE(review): no call that destroys the cache is visible before the
 * pointer is cleared -- confirm the cache is released in the complete
 * source, otherwise this leaks it.
 */
void
dnode_fini(void)
{
dnode_cache = NULL;
}
#ifdef ZFS_DEBUG
void
{
int drop_struct_lock = FALSE;
if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
return;
}
int i;
if (dn->dn_datablkshift) {
}
for (i = 0; i < TXG_SIZE; i++) {
}
}
}
if (drop_struct_lock)
}
#endif
void
{
int i;
return;
}
/*
* dn_nblkptr is only one byte, so it's OK to read it in either
* byte order. We can't read dn_bonuslen.
*/
/*
* OK to check dn_bonuslen for zero, because it won't matter if
* we have the wrong byte order. This is necessary because the
* dnode dnode is smaller than a regular dnode.
*/
if (dnp->dn_bonuslen != 0) {
/*
* Note that the bonus length calculated here may be
* longer than the actual bonus buffer. This is because
* we always put the bonus buffer after the last block
* pointer (instead of packing it against the end of the
* dnode buffer).
*/
}
/* Swap SPILL block if we have one */
}
void
{
int i;
size >>= DNODE_SHIFT;
for (i = 0; i < size; i++) {
buf++;
}
}
void
{
if (newsize == 0)
else
}
void
{
}
void
{
}
static void
{
}
static dnode_t *
{
/*
* Defer setting dn_objset until the dnode is ready to be a candidate
* for the dnode_move() callback.
*/
if (dnp->dn_datablkszsec) {
} else {
dn->dn_datablksz = 0;
dn->dn_datablkszsec = 0;
dn->dn_datablkshift = 0;
}
dn->dn_id_flags = 0;
/*
* Everything else must be valid before assigning dn_objset makes the
* dnode eligible for dnode_move().
*/
return (dn);
}
/*
* Caller must be holding the dnode handle, which is released upon return.
*/
static void
{
/* the dnode can no longer move, so we can release the handle */
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
dn->dn_dirtyctx = 0;
}
}
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
dn->dn_id_flags = 0;
dn->dn_unlisted_l0_blkid = 0;
}
void
{
int i;
if (blocksize == 0)
else
if (ibs == 0)
for (i = 0; i < TXG_SIZE; i++) {
}
else
dn->dn_dirtyctx = 0;
dn->dn_free_txg = 0;
if (dn->dn_dirtyctx_firstset) {
}
dn->dn_id_flags = 0;
}
void
{
int nblkptr;
/* clean up any unreferenced dbufs */
dn->dn_id_flags = 0;
/* change blocksize */
dnode_block_freed(dn, 0)));
}
nblkptr = 1;
else
}
/* change type */
/* change bonus size and type */
/* fix up the bonus db_size */
}
}
#ifdef DNODE_STATS
static struct {
#endif /* DNODE_STATS */
static void
{
int i;
/* Copy fields. */
sizeof (odn->dn_next_nblkptr));
sizeof (odn->dn_next_nlevels));
sizeof (odn->dn_next_indblkshift));
sizeof (odn->dn_next_bonustype));
sizeof (odn->dn_rm_spillblk));
sizeof (odn->dn_next_bonuslen));
sizeof (odn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
&odn->dn_dirty_records[i]);
}
sizeof (odn->dn_free_ranges));
/*
* Update back pointers. Updating the handle fixes the back pointer of
* every descendant dbuf as well as the bonus dbuf.
*/
}
/*
* Invalidate the original dnode by clearing all of its back pointers.
*/
odn->dn_dbufs_count = 0;
odn->dn_unlisted_l0_blkid = 0;
/*
* Set the low bit of the objset pointer to ensure that dnode_move()
* recognizes the dnode as invalid in any subsequent callback.
*/
/*
* Satisfy the destructor.
*/
for (i = 0; i < TXG_SIZE; i++) {
sizeof (dbuf_dirty_record_t),
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
odn->dn_rm_spillblk[i] = 0;
odn->dn_next_bonuslen[i] = 0;
odn->dn_next_blksz[i] = 0;
}
odn->dn_allocated_txg = 0;
odn->dn_free_txg = 0;
odn->dn_assigned_txg = 0;
odn->dn_dirtyctx = 0;
odn->dn_oldused = 0;
odn->dn_oldflags = 0;
odn->dn_id_flags = 0;
/*
* Mark the dnode.
*/
}
#ifdef _KERNEL
/*ARGSUSED*/
static kmem_cbrc_t
{
/*
* The dnode is on the objset's list of known dnodes if the objset
* pointer is valid. We set the low bit of the objset pointer when
* freeing the dnode to invalidate it, and the memory patterns written
* by kmem (baddcafe and deadbeef) set at least one of the two low bits.
* A newly created dnode sets the objset pointer last of all to indicate
* that the dnode is known and in a valid state to be moved by this
* function.
*/
if (!POINTER_IS_VALID(os)) {
return (KMEM_CBRC_DONT_KNOW);
}
/*
* Ensure that the objset does not go away during the move.
*/
return (KMEM_CBRC_DONT_KNOW);
}
/*
* If the dnode is still valid, then so is the objset. We know that no
* valid objset can be freed while we hold os_lock, so we can safely
* ensure that the objset remains in use.
*/
/*
* Recheck the objset pointer in case the dnode was removed just before
* acquiring the lock.
*/
return (KMEM_CBRC_DONT_KNOW);
}
/*
* At this point we know that as long as we hold os->os_lock, the dnode
* cannot be freed and fields within the dnode can be safely accessed.
* The objset listing this dnode cannot go away as long as this dnode is
* on its list.
*/
return (KMEM_CBRC_NO);
}
/*
* Lock the dnode handle to prevent the dnode from obtaining any new
* holds. This also prevents the descendant dbufs and the bonus dbuf
* from accessing the dnode, so that we can discount their holds. The
* handle is safe to access because we know that while the dnode cannot
* go away, neither can its handle. Once we hold dnh_zrlock, we can
* safely move any dnode referenced only by dbufs.
*/
return (KMEM_CBRC_LATER);
}
/*
* Ensure a consistent view of the dnode's holds and the dnode's dbufs.
* We need to guarantee that there is a hold for every dbuf in order to
* determine whether the dnode is actively referenced. Falsely matching
* a dbuf to an active hold would lead to an unsafe move. It's possible
* that a thread already having an active dnode hold is about to add a
* dbuf, and we can't compare hold and dbuf counts while the add is in
* progress.
*/
return (KMEM_CBRC_LATER);
}
/*
* A dbuf may be removed (evicted) without an active dnode hold. In that
* case, the dbuf count is decremented under the handle lock before the
* dbuf's hold is released. This order ensures that if we count the hold
* after the dbuf is removed but before its hold is released, we will
* treat the unmatched hold as active and exit safely. If we count the
* hold before the dbuf is removed, the hold is discounted, and the
* removal is blocked until the move completes.
*/
/* We can't have more dbufs than dnode holds. */
return (KMEM_CBRC_LATER);
}
/*
* At this point we know that anyone with a hold on the dnode is not
* actively referencing it. The dnode is known and in a valid state to
* move. We're holding the locks needed to execute the critical section.
*/
/* If the dnode was safe to move, the refcount cannot have changed. */
return (KMEM_CBRC_YES);
}
#endif /* _KERNEL */
void
{
/*
* Wait for final references to the dnode to clear. This can
* only happen if the arc is asynchronously evicting state that
* has a hold on this dnode while we are trying to evict this
* dnode.
*/
delay(1);
}
dnode_t *
{
return (dn);
}
static void
{
int i;
for (i = 0; i < epb; i++) {
/*
* The dnode handle lock guards against the dnode moving to
* another valid address, so there is no need here to guard
* against changes to or from NULL.
*/
continue;
}
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
* it wouldn't be eligible for eviction and this function
* would not have been called.
*/
}
epb * sizeof (dnode_handle_t));
}
/*
* errors:
* EINVAL - invalid object number.
* EIO - i/o error.
* succeeds even for free dnodes.
*/
int
{
int drop_struct_lock = FALSE;
int type;
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
* which may require us to read from the root filesystem while
* holding some (not all) of the locks as writer.
*/
return (0);
}
}
if (drop_struct_lock)
if (err) {
return (err);
}
if (children_dnodes == NULL) {
int i;
for (i = 0; i < epb; i++) {
}
for (i = 0; i < epb; i++) {
}
epb * sizeof (dnode_handle_t));
}
}
}
}
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_FREE) &&
}
/* Now we can rely on the hold to prevent the dnode from moving. */
return (0);
}
/*
* Return held dnode if the object is allocated, NULL if not.
*/
int
{
}
/*
* Can only add a reference if there is already at least one
* reference on the dnode. Returns FALSE if unable to add a
* new reference.
*/
{
return (FALSE);
}
return (TRUE);
}
void
{
/* Get while the hold prevents the dnode from moving. */
/*
* It's unsafe to release the last hold on a dnode by dnode_rele() or
* indirectly by dbuf_rele() while relying on the dnode handle to
* prevent the dnode from moving, since releasing the last hold could
* result in the dnode's parent dbuf evicting its dnode handles. For
* that reason anyone calling dnode_rele() or dbuf_rele() without some
* other direct or indirect hold on the dnode must first drop the dnode
* handle.
*/
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
/*
* Another thread could add a hold to the dnode handle in
* dnode_hold_impl() while holding the parent dbuf. Since the
* hold on the parent dbuf prevents the handle from being
* destroyed, the hold on the handle is OK. We can't yet assert
* that the handle has zero references, but that will be
* asserted anyway when the handle gets destroyed.
*/
}
}
void
{
return;
}
#ifdef ZFS_DEBUG
#endif
/*
*/
/*
* If we are already marked dirty, we're done.
*/
return;
}
} else {
}
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
* dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
* children.
*/
}
void
{
/* we should be the only holder... hopefully */
/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
return;
}
/*
* If the dnode is already dirty, it needs to be moved from
* the dirty list to the free list.
*/
} else {
}
}
/*
* Try to change the block size for the indicated dnode. This can only
* succeed if there are no blocks allocated or dirty beyond the first block.
*/
int
{
int err;
if (size == 0)
else
ibs = 0;
return (0);
/* Check for any allocated blocks beyond the first */
if (dn->dn_maxblkid != 0)
goto fail;
goto fail;
}
}
goto fail;
/* resize the old block */
if (err == 0)
goto fail;
if (ibs) {
}
/* rele after we have fixed the blocksize in the dnode */
if (db)
return (0);
fail:
}
/* read-holding callers must not rely on the lock being continuously held */
void
{
int epbs, new_nlevels;
/*
* if we have a read-lock, check to see if we need to do any work
* before upgrading to a write-lock.
*/
if (have_read) {
return;
}
}
goto out;
/*
* Compute the number of levels necessary to support the new maxblkid.
*/
new_nlevels = 1;
new_nlevels++;
/* dirty the left indirects */
/* transfer the dirty records to the new indirect */
}
}
}
out:
if (have_read)
}
void
{
int epbs;
if (len == DMU_OBJECT_END) {
}
/*
* First, block align the region to free:
*/
goto out;
} else {
/*
* Freeing the whole block; fast-track this request.
* Note that we won't dirty any indirect blocks,
* which is fine because we will be freeing the entire
* file and thus all indirect blocks will be freed
* by free_children().
*/
blkid = 0;
nblks = 1;
goto done;
/* Freeing past end-of-data */
goto out;
} else {
/* Freeing part of the block. */
}
}
/* zero out any partial block data at the start of the range */
if (head) {
/* don't dirty if it isn't on disk and isn't dirty */
if (db->db_last_dirty ||
}
}
}
/* If the range was less than one block, we're done */
if (len == 0)
goto out;
/* If the remaining range is past end of file, we're done */
goto out;
if (trunc)
tail = 0;
else
/* zero out any partial block data at the end of the range */
if (tail) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
}
}
}
/* If the range did not include a full block, we are done */
if (len == 0)
goto out;
if (trunc)
nblks += 1;
/*
* Dirty the first and last indirect blocks, as they (and/or their
* parents) will need to be written out if they were only
* partially freed. Interior indirect blocks will be themselves freed,
* by free_children(), so they need not be dirtied. Note that these
* interior blocks have already been prefetched by dmu_tx_hold_free().
*/
}
if (trunc)
else
}
}
done:
/*
* Add this range to the dnode range list.
* We will finish up this free operation in the syncing phase.
*/
}
out:
}
static boolean_t
{
int i;
for (i = 0; i < TXG_SIZE; i++) {
break;
}
return (i < TXG_SIZE);
}
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
{
int i;
if (blkid == DMU_BONUS_BLKID)
return (FALSE);
/*
* If we're in the process of opening the pool, dp will not be
* set yet, but there shouldn't be anything dirty.
*/
return (FALSE);
if (dn->dn_free_txg)
return (TRUE);
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));
for (i = 0; i < TXG_SIZE; i++) {
break;
}
return (i < TXG_SIZE);
}
void
{
(longlong_t)delta);
if (delta > 0) {
} else {
}
} else {
}
}
/*
* Call when we think we're going to write/free space in open context to track
* the amount of memory in use by the currently open txg.
*/
void
{
}
}
/*
* Scans a block at the indicated "level" looking for a hole or data,
* depending on 'flags'.
*
* If level > 0, then we are scanning an indirect block looking at its
* pointers. If level == 0, then we are looking at a block of dnodes.
*
* If we don't find what we are looking for in the block, we return ESRCH.
* Otherwise, return with *offset pointing to the beginning (if searching
* forwards) or end (if searching backwards) of the range covered by the
* block pointer we matched on (or dnode).
*
* The basic search algorithm used below by dnode_next_offset() is to
* use this function to search up the block tree (widen the search) until
* we find something (i.e., we don't return ESRCH) and then search back
* down the tree (narrow the search) until we reach our original search
* level.
*/
static int
{
dprintf("probing object %llu offset %llx level %d of %u\n",
error = 0;
} else {
if (error) {
return (error);
if (hole)
return (0);
/*
* This can only happen when we are searching up
* the block tree for data. We don't really need to
* adjust the offset, as we will just end up looking
* at the pointer to this block in its parent, and it's
* going to be unallocated, so we will skip over it.
*/
}
if (error) {
return (error);
}
}
/*
* This can only happen when we are searching up the tree
* and these conditions mean that we need to keep climbing.
*/
} else if (lvl == 0) {
span = DNODE_SHIFT;
break;
}
if (i < 0 || i == blkfill)
} else {
minfill = 0;
if (hole)
maxfill--;
else
minfill++;
break;
}
if (inc < 0) {
/* traversing backwards; position offset at the end */
}
if (i < 0 || i >= epb)
}
if (db)
return (error);
}
/*
* Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find
* in an L0 data block; this value is 1 for normal objects,
* DNODES_PER_BLOCK for the meta dnode, and some fraction of
* DNODES_PER_BLOCK when searching for sparse regions thereof.
*
* Examples:
*
* dnode_next_offset(dn, flags, offset, 1, 1, 0);
* Used in dmu_offset_next().
*
* dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
* Only finds objects that have new contents since txg (i.e.,
* bonus buffer changes and content removal are ignored).
* Used in dmu_object_next().
*
* dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
* Used in dmu_object_alloc().
*/
int
{
int error = 0;
if (!(flags & DNODE_FIND_HAVELOCK))
goto out;
}
if (dn->dn_datablkshift == 0) {
if (flags & DNODE_FIND_HOLE)
} else {
}
goto out;
}
break;
}
}
/*
* There's always a "virtual hole" at the end of the object, even
* if all BP's which physically exist are non-holes.
*/
error = 0;
}
out:
if (!(flags & DNODE_FIND_HAVELOCK))
return (error);
}