/* zio.c revision bbe36defdfa03da1119804d4ca2d48c1fc6c4ce1 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
/*
* ==========================================================================
* I/O priority table
* ==========================================================================
*/
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
0, /* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
6, /* ZIO_PRIORITY_ASYNC_READ */
4, /* ZIO_PRIORITY_ASYNC_WRITE */
4, /* ZIO_PRIORITY_FREE */
0, /* ZIO_PRIORITY_CACHE_FILL */
0, /* ZIO_PRIORITY_LOG_WRITE */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
};
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
/*
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
#define IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
void
zio_init(void)
{
size_t c;
#ifdef _KERNEL
#endif
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
* for each quarter-power of 2. For large buffers, we want
* a cache for each multiple of PAGESIZE.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
}
if (align != 0) {
char name[36];
}
}
while (--c != 0) {
}
}
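
/*
 * Illustrative sketch, not code from this file: one way to derive the
 * kmem-cache alignment that the comment in zio_init() describes.  The
 * helper name and the exact small/medium/large thresholds are assumptions
 * for illustration only.
 */
static size_t
zio_example_cache_align(size_t size)
{
	size_t p2 = size;

	/* Reduce p2 to the largest power of two that is <= size. */
	while (p2 & (p2 - 1))
		p2 &= p2 - 1;

	if (size <= 4 * SPA_MINBLOCKSIZE)
		return (SPA_MINBLOCKSIZE);	/* small: multiple of SPA_MINBLOCKSIZE */
	else if ((size % PAGESIZE) == 0)
		return (PAGESIZE);		/* large: multiple of PAGESIZE */
	else if ((size % (p2 >> 2)) == 0)
		return (p2 >> 2);		/* medium: quarter-power of two */
	else
		return (0);			/* no dedicated cache for this size */
}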
void
zio_fini(void)
{
size_t c;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
}
zio_buf_cache[c] = NULL;
if (zio_data_buf_cache[c] != last_data_cache) {
}
zio_data_buf_cache[c] = NULL;
}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
void *
{
}
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists so that we will limit the amount
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
* of kernel heap dumped to disk when the kernel panics)
*/
void *
{
}
void
{
}
void
{
}
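
/*
 * Usage sketch (an assumption, for illustration only): metadata buffers
 * come from zio_buf_alloc() so they are captured in a crash dump, while
 * file data comes from zio_data_buf_alloc() so it is not.  Buffers must
 * go back to the matching free routine (zio_buf_free()/zio_data_buf_free(),
 * assumed here) with the same size.
 */
static void
zio_example_buf_usage(size_t size)
{
	void *meta = zio_buf_alloc(size);	/* appears in crash dumps */
	void *data = zio_data_buf_alloc(size);	/* excluded from crash dumps */

	/* ... fill and consume the buffers ... */

	zio_buf_free(meta, size);
	zio_data_buf_free(data, size);
}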
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
{
}
static void
{
}
}
/*
* ==========================================================================
* I/O transform callbacks for subblocks and decompression
* ==========================================================================
*/
static void
{
}
static void
{
}
/*
* ==========================================================================
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
/*
* NOTE - Callers to zio_walk_parents() and zio_walk_children() must
* continue calling these functions until they return NULL.
* Otherwise, the next caller will pick up the list walk in
* some indeterminate state.  (The alternative would be for every caller
* to pass in a cookie to keep the state represented by io_walk_link,
* which gets annoying.)
*/
zio_t *
{
return (NULL);
}
zio_t *
{
return (NULL);
}
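
/*
 * Usage sketch (an assumption): per the note above, every walk must run
 * until the walker returns NULL so that io_walk_link is left in a clean
 * state for the next caller.  The helper below merely counts parents,
 * but it shows the required pattern.
 */
static int
zio_example_count_parents(zio_t *cio)
{
	zio_t *pio;
	int count = 0;

	while ((pio = zio_walk_parents(cio)) != NULL)
		count++;

	return (count);
}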
zio_t *
{
return (pio);
}
void
{
/*
* The following ASSERT captures all of these constraints.
*/
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
}
static void
{
}
static boolean_t
{
if (*countp != 0) {
}
return (waiting);
}
static void
{
} else {
}
}
static void
{
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
* ==========================================================================
*/
static zio_t *
{
else if (flags & ZIO_FLAG_GANG_CHILD)
else
if (type != ZIO_TYPE_WRITE)
}
}
return (zio);
}
static void
{
}
zio_t *
{
return (zio);
}
zio_t *
{
}
zio_t *
{
return (zio);
}
void
{
}
zio_t *
{
return (zio);
}
zio_t *
{
return (zio);
}
zio_t *
{
}
return (zio);
}
zio_t *
{
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
* immediate writes contain committed data, but in a txg that was
* *not* committed. Upon opening the pool after an unclean shutdown,
* the intent log claims all blocks that contain immediate write data
* so that the SPA knows they're in use.
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
*/
return (zio);
}
zio_t *
{
int c;
if (vd->vdev_children == 0) {
} else {
for (c = 0; c < vd->vdev_children; c++)
}
return (zio);
}
zio_t *
{
return (zio);
}
zio_t *
{
/*
* zbt checksums are necessarily destructive -- they modify the buffer
* being checksummed. Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
}
return (zio);
}
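
/*
 * Illustrative sketch (an assumption, not this file's code): the local
 * copy the comment above calls for.  A destructive (embedded) checksum
 * scribbles on the buffer it sums, so checksumming and writing a private
 * copy keeps concurrent users of 'data' safe.
 */
static void *
zio_example_private_copy(const void *data, size_t size)
{
	void *copy = zio_buf_alloc(size);

	bcopy(data, copy, size);
	return (copy);		/* caller checksums and writes the copy */
}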
/*
* Create a child I/O to do some work for us.
*/
zio_t *
{
/*
* If we have the bp, then the child should perform the
* checksum and the parent need not. This pushes error
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
}
if (vd->vdev_children == 0)
return (zio);
}
zio_t *
{
return (zio);
}
void
{
}
/*
* ==========================================================================
* Prepare to read and write logical blocks
* ==========================================================================
*/
static int
{
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
void *cbuf;
int pass = 1;
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
*/
return (ZIO_PIPELINE_STOP);
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
* have to allocate new blocks. But compression changes
* the blocksize, which forces a reallocate, and makes
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
if (pass > SYNC_PASS_DONT_COMPRESS)
/* Make sure someone doesn't change their mind on overwrites */
}
if (compress != ZIO_COMPRESS_OFF) {
} else if (csize != 0) {
}
}
/*
* The final pass of spa_sync() must be all rewrites, but the first
* few passes offer a trade-off: allocating blocks defers convergence,
* but newly allocated blocks are sequential, so they can be written
* to disk faster. Therefore, we allow the first few passes of
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
pass > SYNC_PASS_REWRITE) {
} else {
}
if (csize == 0) {
} else {
}
return (ZIO_PIPELINE_CONTINUE);
}
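
/*
 * Illustrative sketch (an assumption, not the pipeline code): the
 * pass-based policy described in the comments above.  SYNC_PASS_DONT_COMPRESS
 * and SYNC_PASS_REWRITE are the thresholds named there; the helper and its
 * out-parameters exist only for illustration.
 */
static void
zio_example_sync_pass_policy(int pass, enum zio_compress *compressp,
    boolean_t *must_rewrite)
{
	/* After a few passes, stop compressing so spa_sync() can converge. */
	if (pass > SYNC_PASS_DONT_COMPRESS)
		*compressp = ZIO_COMPRESS_OFF;

	/* After a few more, stop allocating new blocks and rewrite in place. */
	*must_rewrite = (pass > SYNC_PASS_REWRITE);
}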
/*
* ==========================================================================
* Execute the I/O pipeline
* ==========================================================================
*/
static void
{
/*
* If we're a config writer or a probe, the normal issue and
* interrupt threads may all be blocked waiting for the config lock.
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
*/
t = ZIO_TYPE_NULL;
/*
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
*/
t = ZIO_TYPE_NULL;
}
static boolean_t
{
for (zio_type_t t = 0; t < ZIO_TYPES; t++)
return (B_TRUE);
return (B_FALSE);
}
static int
{
return (ZIO_PIPELINE_STOP);
}
void
{
}
/*
* Execute the I/O pipeline until one of the following occurs:
* (1) the I/O completes; (2) the pipeline stalls waiting for
* dependent child I/Os; (3) the I/O issues, so we're waiting
* for an I/O completion interrupt; (4) the I/O is delegated by
* vdev-level caching or aggregation; (5) the I/O is deferred
* due to vdev-level queueing; (6) the I/O is handed off to
* another thread. In all cases, the pipeline stops whenever
* there's no CPU work; it never burns a thread in cv_wait().
*
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
void
{
int rv;
continue;
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
* issue async to avoid deadlock.
*/
return;
}
if (rv == ZIO_PIPELINE_STOP)
return;
}
}
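
/*
 * Shape of the dispatch loop described above (illustrative sketch; the
 * stage table, resume index, and handler type are hypothetical stand-ins,
 * not this file's actual pipeline representation).
 */
static void
zio_example_pipeline_shape(zio_t *zio, int (**stages)(zio_t *), int *stagep)
{
	/* Resume from wherever the previous invocation left off. */
	while (stages[*stagep] != NULL) {
		int rv = stages[*stagep](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;	/* re-entered later: interrupt, child completion, etc. */

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
		(*stagep)++;
	}
}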
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
int
{
int error;
return (error);
}
void
{
/*
* This is a logical async I/O with no parent to wait for it.
* We add it to the spa_async_root_zio "Godfather" I/O which
* will ensure they complete prior to unloading the pool.
*/
}
}
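
/*
 * Usage sketch (an assumption, for illustration only): synchronous callers
 * use zio_wait() and get the I/O's error; fire-and-forget callers use
 * zio_nowait(), relying on the "Godfather" root zio described above to
 * keep orphaned async I/O from outliving the pool.
 */
static int
zio_example_issue(zio_t *zio_sync, zio_t *zio_async)
{
	/* Fire and forget; completion is tracked by the Godfather zio. */
	zio_nowait(zio_async);

	/* Block until the I/O and all of its children complete. */
	return (zio_wait(zio_sync));
}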
/*
* ==========================================================================
* Reexecute or suspend/resume failed I/O
* ==========================================================================
*/
static void
{
pio->io_reexecute = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
if (IO_IS_ALLOCATING(pio)) {
/*
* Remember the failed bp so that the io_ready() callback
* can update its accounting upon reexecution. The block
* was already freed in zio_done(); we indicate this with
* a fill count of -1 so that zio_free() knows to skip it.
*/
}
/*
* As we reexecute pio's children, new children could be created.
* New children go to the head of pio's io_child_list, however,
* so we will (correctly) not reexecute them. The key is that
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
}
/*
* Now that all children have been reexecuted, execute the parent.
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
}
void
{
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
"failure and the failure mode property for this pool "
}
}
int
{
/*
* Reexecute all previously suspended i/o.
*/
return (0);
}
void
{
while (spa_suspended(spa))
}
/*
* ==========================================================================
* Gang blocks.
*
* A gang block is a collection of small blocks that looks to the DMU
* like one large block. When zio_dva_allocate() cannot find a block
* of the requested size, due to either severe fragmentation or the pool
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
* an indirect block: it's an array of block pointers. It consumes
* only one sector and hence is allocatable regardless of fragmentation.
* The gang header's bps point to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
* Critically, the gang block bp's blk_cksum is the checksum of the data,
* not the gang header. This ensures that data block signatures (needed for
* deduplication) are independent of how the block is physically stored.
*
* Gang blocks can be nested: a gang member may itself be a gang block.
* Thus every gang block is a tree in which root and all interior nodes are
* gang headers, and the leaves are normal blocks that contain user data.
* The root of the gang tree is called the gang leader.
*
* To perform any operation (read, rewrite, free, claim) on a gang block,
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
* in the io_gang_tree field of the original logical i/o by recursively
* reading the gang leader and all gang headers below it. This yields
* an in-core tree containing the contents of every gang header and the
* bps for every constituent of the gang block.
*
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
* of the gang header plus zio_checksum_compute() of the data to update the
* gang header's blk_cksum as described above.
*
*
* The two-phase assemble/issue model solves the problem of partial failure --
* what if you'd freed part of a gang block but then couldn't read the
* gang header for another part? Assembling the entire gang tree first
* ensures that all the necessary gang header I/O has succeeded before
* starting the actual work of free, claim, or write. Once the gang tree
* is assembled, free and claim are in-memory operations that cannot fail.
*
* In the event that a gang write fails, zio_dva_unallocate() walks the
* gang tree to immediately free (i.e. insert back into the space map)
* everything we've allocated. This ensures that we don't get ENOSPC
* errors during repeated suspend/resume cycles due to a flaky device.
*
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
* the gang tree, we won't modify the block, so we can safely defer the free
* (knowing that the block is still intact). If we *can* assemble the gang
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
* each constituent bp and we can allocate a new block on the next sync pass.
*
* In all cases, the gang tree allows complete recovery from partial failure.
* ==========================================================================
*/
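
/*
 * Illustrative sketch (an assumption, not the code that follows): the shape
 * of the gang-tree walk described above.  Each gang header holds up to
 * SPA_GBH_NBLKPTRS bps; a bp whose node has children recurses, and every
 * bp gets the per-type callback.  The callback type and helper are
 * hypothetical.
 */
typedef void zio_example_gang_cb_t(zio_t *pio, blkptr_t *bp, void *arg);

static void
zio_example_gang_walk(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp,
    zio_example_gang_cb_t *cb, void *arg)
{
	cb(pio, bp, arg);

	if (gn == NULL)		/* leaf: ordinary data block, nothing below it */
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_example_gang_walk(pio, gn->gn_child[g], gbp, cb, arg);
	}
}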
static zio_t *
{
return (pio);
&pio->io_bookmark));
}
zio_t *
{
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
* compute a new data checksum, so we do that here. The one
* exception is the gang leader: the pipeline already computed
* its data checksum because that stage precedes gang assembly.
* (Presently, nothing actually uses interior data checksums;
* this is just good hygiene.)
*/
}
} else {
}
return (zio);
}
/* ARGSUSED */
zio_t *
{
}
/* ARGSUSED */
zio_t *
{
}
NULL,
};
static zio_gang_node_t *
{
return (gn);
}
static void
{
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
}
static void
{
return;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
}
static void
{
}
static void
{
return;
if (BP_SHOULD_BYTESWAP(bp))
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
if (!BP_IS_GANG(gbp))
continue;
}
}
static void
{
/*
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
if (BP_IS_HOLE(gbp))
continue;
}
}
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_STOP);
else
return (ZIO_PIPELINE_CONTINUE);
}
static void
{
return;
}
}
static int
{
int error;
if (error) {
return (ZIO_PIPELINE_CONTINUE);
}
} else {
}
/*
* Create the gang header.
*/
/*
* Create and nowait the gang children.
*/
&pio->io_bookmark));
}
/*
* Set pio's pipeline to just wait for zio to finish.
*/
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static int
{
int error;
}
if (error) {
return (zio_write_gang_block(zio));
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int error;
if (error)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Undo an allocation. This is used by zio_done() when an I/O fails
* and we want to give back the block we just allocated.
* This handles both normal blocks and gang blocks.
*/
static void
{
/*
* This is a rewrite for sync-to-convergence.
* We can't do a metaslab_free(NOW) because bp wasn't allocated
* during this sync pass, which means that metaslab_sync()
* already committed the allocation.
*/
/*
* This is a gang leader whose gang header(s) we
* couldn't read now, so defer the free until later.
* The block should still be intact because without
* the headers, we'd never even start the rewrite.
*/
return;
}
}
if (!BP_IS_HOLE(bp))
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
}
}
}
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
{
int error;
if (error)
if (error == 0) {
BP_SET_LEVEL(new_bp, 0);
}
return (error);
}
/*
* Free an intent log block. We know it can't be a gang block, so there's
* nothing to do except metaslab_free() it.
*/
void
{
}
/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
*/
static int
{
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
}
}
}
/*
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
* This prevents spurious resilvering with nested replication.
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
* A is out of date, we'll read from C+D, then use the data to
* resilver A+B -- but we don't actually want to resilver B, just A.
* The top-level mirror has no way to know this, so instead we just
* discard unnecessary repairs as we work our way down the vdev tree.
* The same logic applies to any form of nested replication:
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
*/
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_CONTINUE);
return (ZIO_PIPELINE_STOP);
return (ZIO_PIPELINE_STOP);
}
}
}
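
/*
 * Illustrative sketch (an assumption): the kind of DTL test the repair
 * comment above describes.  A plain repair write (one that is not
 * self-healing) is worth issuing only if the target vdev's DTL says this
 * txg actually needs repair.  ZIO_FLAG_IO_REPAIR, ZIO_FLAG_SELF_HEAL, and
 * vdev_dtl_contains() are assumed to be the underlying flags/interface.
 */
static boolean_t
zio_example_repair_needed(vdev_t *vd, zio_t *zio)
{
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL))
		return (vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1));

	return (B_TRUE);	/* not a plain repair; don't filter it */
}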
static int
{
return (ZIO_PIPELINE_STOP);
} else {
}
}
}
if (unexpected_error)
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_STOP);
}
/*
* If the I/O failed, determine whether we should attempt to retry it.
*/
return (ZIO_PIPELINE_STOP);
}
/*
* If we got an error on a leaf device, convert it to ENXIO
* if the device is not accessible at all.
*/
/*
* If we can't write to an interior vdev (mirror or RAID-Z),
* set vdev_cant_write so that we stop trying to allocate from it.
*/
return (ZIO_PIPELINE_CONTINUE);
}
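
/*
 * Illustrative sketch (an assumption): the two assessments described in the
 * comments above, with the exact conditions simplified.  vdev_accessible()
 * and the vdev_cant_write flag are the interfaces assumed here.
 */
static void
zio_example_assess_device_error(zio_t *zio, vdev_t *vd)
{
	/* A leaf-device error with the device unreachable collapses to ENXIO. */
	if (zio->io_error != 0 && vd->vdev_children == 0 &&
	    !vdev_accessible(vd, zio))
		zio->io_error = ENXIO;

	/* A failed write to an interior vdev makes it unallocatable. */
	if (zio->io_error != 0 && zio->io_type == ZIO_TYPE_WRITE &&
	    vd->vdev_children != 0)
		vd->vdev_cant_write = B_TRUE;
}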
void
{
}
void
{
}
void
{
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static int
{
enum zio_checksum checksum;
/*
* This is zio_write_phys().
* We're either generating a label checksum, or none at all.
*/
if (checksum == ZIO_CHECKSUM_OFF)
return (ZIO_PIPELINE_CONTINUE);
} else {
} else {
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int error;
/*
* This is zio_read_phys().
* We're either verifying a label checksum, or nothing at all.
*/
return (ZIO_PIPELINE_CONTINUE);
}
}
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
{
}
/*
* ==========================================================================
* Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
* ==========================================================================
*/
int
{
break;
break;
}
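
/*
 * Illustrative sketch (an assumption): one way to realize the ranking above.
 * The file's worst-error helper presumably compares two errors by rank; the
 * table and function here are simplified stand-ins for it.
 */
static int
zio_example_error_rank(int error)
{
	static const int rank_table[] = { 0, ENXIO, ECKSUM, EIO };
	int r;

	for (r = 0; r < sizeof (rank_table) / sizeof (int); r++)
		if (error == rank_table[r])
			return (r);

	return (r);	/* unexpected errors rank worst of all */
}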
/*
* ==========================================================================
* I/O completion
* ==========================================================================
*/
static int
{
return (ZIO_PIPELINE_STOP);
}
/*
* As we notify zio's parents, new parents could be added.
* New parents go to the head of zio's io_parent_list, however,
* so we will (correctly) not notify them. The remainder of zio's
* io_parent_list, from 'pio_next' onward, cannot change because
* all parents must wait for us to be done before they can be done.
*/
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
/*
* If our children haven't all completed,
* wait for them and then repeat this pipeline stage.
*/
return (ZIO_PIPELINE_STOP);
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
}
}
/*
* If there were child vdev or gang errors, they apply to us now.
*/
/*
* If this I/O is attached to a particular vdev,
* generate an error message describing the I/O failure
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
0, 0);
}
}
/*
* Determine whether zio should be reexecuted. This will
* propagate all the way to the root via zio_notify_parent().
*/
if (IO_IS_ALLOCATING(zio))
else
}
/*
* If there were logical child errors, they apply to us now.
* We defer this until now to avoid conflating logical child
* errors with errors that happened to the zio itself when
* updating vdev stats and reporting FMA events above.
*/
}
/*
* Godfather I/Os should never suspend.
*/
zio->io_reexecute = 0;
if (zio->io_reexecute) {
/*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
* the root, it simply notifies its parent and sticks around.
* The parent, seeing that it still has children in zio_done(),
* does the same. This percolates all the way up to the root.
* The root i/o will reexecute or suspend the entire tree.
*
* This approach ensures that zio_reexecute() honors
* all the original i/o dependency relationships, e.g.
* parents not executing until children are ready.
*/
/*
* "The Godfather" I/O monitors its children but is
* not a true parent to them. It will track them through
* the pipeline but severs its ties whenever they get into
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
}
}
/*
* We're not a root i/o, so there's nothing to do
* but notify our parent. Don't propagate errors
* upward since we haven't permanently failed yet.
*/
/*
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
} else {
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
(void) taskq_dispatch(
}
return (ZIO_PIPELINE_STOP);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
*/
}
} else {
}
return (ZIO_PIPELINE_STOP);
}
/*
* ==========================================================================
* I/O pipeline definition
* ==========================================================================
*/
NULL,
};