zio.c revision 7adb730b589e553bf3b1ccfd9bae2df91c5c1061
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/zfeature.h>
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
const char *zio_type_name[ZIO_TYPES] = {
"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
"zio_ioctl"
};
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
/*
* The following actions directly affect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
* Care should be taken when changing these values as they directly impact
* spa_sync() performance. Tuning these values may introduce subtle performance
* pathologies and should only be done in the context of performance analysis.
* These tunables will eventually be removed and replaced with #defines once
* enough analysis has been done to determine optimal values.
*
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
* regular blocks are not deferred.
*/
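/*
 * The tunables this comment refers to are declared roughly as follows
 * (default values as of this revision; treat this as a sketch rather than
 * the authoritative declarations):
 */
int zfs_sync_pass_deferred_free = 2;	/* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5;	/* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2;		/* rewrite new bps starting in this pass */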
/*
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
void
zio_init(void)
{
size_t c;
#ifdef _KERNEL
#endif
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
* for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
#ifndef _KERNEL
/*
* If we are using watchpoints, put each buffer on its own page,
* to eliminate the performance overhead of trapping to the
* kernel when modifying a non-watched buffer that shares the
* page with a watched buffer.
*/
continue;
#endif
}
if (align != 0) {
char name[36];
/*
* Since zio_data bufs do not appear in crash dumps, we
* pass KMC_NOTOUCH so that no allocator metadata is
* stored with the buffers.
*/
cflags | KMC_NOTOUCH);
}
}
while (--c != 0) {
}
}
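/*
 * Illustrative sketch (not part of the original file): the size-class to
 * alignment mapping that the comment at the top of zio_init() describes.
 * Small buffers get one cache per multiple of SPA_MINBLOCKSIZE; larger
 * buffers get one cache per quarter-power of 2. The exact cutoffs used by
 * the elided loop body may differ; treat this as an approximation.
 */
static size_t
zio_cache_align_sketch(size_t size)
{
	size_t p2 = size;

	/* Round size down to the nearest power of 2. */
	while (!ISP2(p2))
		p2 &= p2 - 1;

	if (size <= 4 * SPA_MINBLOCKSIZE)
		return (SPA_MINBLOCKSIZE);	/* multiples of MINBLOCKSIZE */
	else if (IS_P2ALIGNED(size, p2 >> 2))
		return (p2 >> 2);		/* quarter-power of 2 */

	return (0);	/* no dedicated cache for this size */
}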
void
zio_fini(void)
{
size_t c;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
}
zio_buf_cache[c] = NULL;
if (zio_data_buf_cache[c] != last_data_cache) {
}
zio_data_buf_cache[c] = NULL;
}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
void *
{
}
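/*
 * Illustrative sketch (the real body above is elided): a buffer of 'size'
 * bytes is served by the cache whose index is the size rounded up to the
 * next multiple of SPA_MINBLOCKSIZE, minus one. zio_data_buf_alloc() below
 * uses the same indexing against zio_data_buf_cache[]. Treat the exact
 * kmem flags as an assumption.
 */
static void *
zio_buf_alloc_sketch(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}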
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists so that we will limit the amount
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
* of kernel heap dumped to disk when the kernel panics)
*/
void *
{
}
void
{
}
void
{
}
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
{
}
static void
{
if (zt->zt_bufsize != 0)
}
}
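/*
 * Illustrative sketch of the "push" half (the original bodies above are
 * elided): a zio_transform_t records the zio's current data buffer and
 * size, plus an optional transform callback, and is pushed onto
 * zio->io_transform_stack before the zio is pointed at the new buffer.
 * Field names follow sys/zio.h; treat the body as an approximation.
 */
static void
zio_push_transform_sketch(zio_t *zio, void *data, uint64_t size,
    uint64_t bufsize, zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;	/* buffer to restore on pop */
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;		/* 0 means nothing to free */
	zt->zt_transform = transform;		/* applied (if any) on pop */

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;			/* switch to the new buffer */
	zio->io_size = size;
}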
/*
* ==========================================================================
* I/O transform callbacks for subblocks and decompression
* ==========================================================================
*/
static void
{
}
static void
{
}
/*
* ==========================================================================
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
/*
* NOTE - Callers to zio_walk_parents() and zio_walk_children() must
* continue calling these functions until they return NULL.
* Otherwise, the next caller will pick up the list walk in
* some indeterminate state. (Otherwise every caller would
* have to pass in a cookie to keep the state represented by
* io_walk_link, which gets annoying.)
*/
zio_t *
{
return (NULL);
}
zio_t *
{
return (NULL);
}
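/*
 * Illustrative usage sketch (hypothetical caller, not from this file):
 * because the walk state lives in the zio itself rather than in a
 * caller-supplied cookie, each walker must be driven until it returns
 * NULL, per the NOTE above.
 */
static void
zio_walk_children_usage_sketch(zio_t *pio)
{
	zio_t *cio;

	while ((cio = zio_walk_children(pio)) != NULL) {
		/* operate on each child 'cio' here */
	}
}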
zio_t *
{
return (pio);
}
void
{
/*
* The following ASSERT captures all of these constraints.
*/
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_child_count++;
cio->io_parent_count++;
}
static void
{
pio->io_child_count--;
cio->io_parent_count--;
}
static boolean_t
{
if (*countp != 0) {
}
return (waiting);
}
static void
{
(*countp)--;
} else {
}
}
static void
{
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
* ==========================================================================
*/
static zio_t *
{
else if (flags & ZIO_FLAG_GANG_CHILD)
else if (flags & ZIO_FLAG_DDT_CHILD)
else
if (type != ZIO_TYPE_WRITE ||
}
}
return (zio);
}
static void
{
}
zio_t *
{
return (zio);
}
zio_t *
{
}
void
{
zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
}
zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
}
zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
}
zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
}
zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
}
if (BP_IS_EMBEDDED(bp)) {
zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
}
}
/*
* Pool-specific checks.
*
* Note: it would be nice to verify that the blk_birth and
* BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
* allows the birth time of log blocks (and dmu_sync()-ed blocks
* that are in the log) to be arbitrarily large.
*/
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"VDEV %llu",
}
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"VDEV %llu",
}
zfs_panic_recover("blkptr at %p DVA %u has hole "
"VDEV %llu",
}
/*
* "missing" vdevs are valid during import, but we
* don't have their detailed info (e.g. asize), so
* we can't perform any more checks on them.
*/
continue;
}
if (BP_IS_GANG(bp))
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"OFFSET %llu",
}
}
}
zio_t *
{
return (zio);
}
zio_t *
void *private,
{
/*
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
* dedup (just take the already-allocated BP verbatim).
*/
}
return (zio);
}
zio_t *
{
return (zio);
}
void
{
/*
* We must reset the io_prop to match the values that existed
* when the bp was first written by dmu_sync() keeping in mind
* that nopwrite and dedup are mutually exclusive.
*/
}
void
{
/*
* The check for EMBEDDED is a performance optimization. We
* process the free here (by ignoring it) rather than
* putting it on the list and then processing it in zio_free_sync().
*/
if (BP_IS_EMBEDDED(bp))
return;
/*
* Frees that are for the currently-syncing txg, are not going to be
* deferred, and which will not need to do a read (i.e. not GANG or
* DEDUP), can be processed immediately. Otherwise, put them on the
* in-memory list for later processing.
*/
} else {
}
}
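/*
 * Sketch of the decision described in the comment above (assumed shape,
 * not the verbatim body): frees are appended to the per-txg deferred list
 * unless they are plain (non-gang, non-dedup) blocks freed in the
 * currently-syncing txg before the deferred-free pass begins.
 */
static void
zio_free_decision_sketch(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		/* may need a read, or must be deferred: queue it */
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		/* safe to process immediately */
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}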
zio_t *
{
if (BP_IS_EMBEDDED(bp))
/*
* GANG and DEDUP blocks can induce a read (for the gang block header,
* or the DDT), so issue them asynchronously so that this thread is
* not tied up.
*/
return (zio);
}
zio_t *
{
if (BP_IS_EMBEDDED(bp))
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
* immediate writes contain committed data, but in a txg that was
* *not* committed. Upon opening the pool after an unclean shutdown,
* the intent log claims all blocks that contain immediate write data
* so that the SPA knows they're in use.
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
return (zio);
}
zio_t *
{
int c;
if (vd->vdev_children == 0) {
} else {
for (c = 0; c < vd->vdev_children; c++)
}
return (zio);
}
zio_t *
{
return (zio);
}
zio_t *
{
/*
* zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
}
return (zio);
}
/*
* Create a child I/O to do some work for us.
*/
zio_t *
{
/*
* If we have the bp, then the child should perform the
* checksum and the parent need not. This pushes error
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
}
if (vd->vdev_children == 0)
/*
* If we've decided to do a repair, the write is not speculative --
* even if the original read was.
*/
if (flags & ZIO_FLAG_IO_REPAIR)
return (zio);
}
zio_t *
{
return (zio);
}
void
{
}
void
{
/*
* We don't shrink for raidz because of problems with the
* reconstruction when reading back less than the block size.
* Note, BP_IS_RAIDZ() assumes no compression.
*/
}
/*
* ==========================================================================
* Prepare to read and write logical blocks
* ==========================================================================
*/
static int
{
}
} else {
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int pass = 1;
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
*/
return (ZIO_PIPELINE_STOP);
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
if (zio->io_bp_override) {
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_CONTINUE);
return (ZIO_PIPELINE_CONTINUE);
}
}
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
* have to allocate new blocks. But compression changes
* the blocksize, which forces a reallocate, and makes
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
if (pass >= zfs_sync_pass_dont_compress)
/* Make sure someone doesn't change their mind on overwrites */
}
if (compress != ZIO_COMPRESS_OFF) {
return (ZIO_PIPELINE_CONTINUE);
} else {
/*
* Round up compressed size to MINBLOCKSIZE and
* zero the tail.
*/
}
} else {
}
}
}
/*
* The final pass of spa_sync() must be all rewrites, but the first
* few passes offer a trade-off: allocating blocks defers convergence,
* but newly allocated blocks are sequential, so they can be written
* to disk faster. Therefore, we allow the first few passes of
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
pass >= zfs_sync_pass_rewrite) {
} else {
}
if (psize == 0) {
}
} else {
}
if (zp->zp_nopwrite) {
}
}
return (ZIO_PIPELINE_CONTINUE);
}
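/*
 * Sketch of the two sync-pass decisions made inside zio_write_bp_init()
 * above (assumed shape, not the verbatim code): after
 * zfs_sync_pass_dont_compress passes we stop compressing, and after
 * zfs_sync_pass_rewrite passes we rewrite the existing bp in place rather
 * than allocating a new one, so that spa_sync() can converge.
 */
#if 0	/* example only */
	int pass = spa_sync_pass(spa);

	if (pass >= zfs_sync_pass_dont_compress)
		compress = ZIO_COMPRESS_OFF;

	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;	/* reuse the bp */
	} else {
		BP_ZERO(bp);				/* allocate a new bp */
	}
#endif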
static int
{
if (BP_GET_DEDUP(bp))
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Execute the I/O pipeline
* ==========================================================================
*/
static void
{
/*
* If we're a config writer or a probe, the normal issue and
* interrupt threads may all be blocked waiting for the config lock.
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
*/
t = ZIO_TYPE_NULL;
/*
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
*/
t = ZIO_TYPE_NULL;
/*
* If this is a high priority I/O, then use the high priority taskq if
* available.
*/
q++;
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
/*
* NB: We are assuming that the zio can only be dispatched
* to a single taskq at a time. It would be a grievous error
* to dispatch the zio to another taskq at the same time.
*/
}
static boolean_t
{
for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
uint_t i;
for (i = 0; i < tqs->stqs_count; i++) {
return (B_TRUE);
}
}
return (B_FALSE);
}
static int
{
return (ZIO_PIPELINE_STOP);
}
void
{
}
/*
* Execute the I/O pipeline until one of the following occurs:
*
* (1) the I/O completes
* (2) the pipeline stalls waiting for dependent child I/Os
* (3) the I/O issues, so we're waiting for an I/O completion interrupt
* (4) the I/O is delegated by vdev-level caching or aggregation
* (5) the I/O is deferred due to vdev-level queueing
* (6) the I/O is handed off to another thread.
*
* In all cases, the pipeline stops whenever there's no CPU work; it never
* burns a thread in cv_wait().
*
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
static zio_pipe_stage_t *zio_pipeline[];
void
{
int rv;
do {
stage <<= 1;
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
* or may wait for an I/O that needs an interrupt thread
* to complete, issue async to avoid deadlock.
*
* For VDEV_IO_START, we cut in line so that the io will
* be sent to disk promptly.
*/
return;
}
if (rv == ZIO_PIPELINE_STOP)
return;
}
}
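/*
 * Sketch of the stage-advance loop that drives zio_execute() (assumed
 * shape; the original body above is elided): io_stage is a one-hot bit,
 * so the next stage is found by shifting left until a bit present in
 * io_pipeline is reached, and the matching entry in zio_pipeline[] is
 * invoked until a stage returns ZIO_PIPELINE_STOP.
 */
#if 0	/* example only */
	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;
	}
#endif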
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
int
{
int error;
return (error);
}
void
{
/*
* This is a logical async I/O with no parent to wait for it.
* We add it to the spa_async_root_zio "Godfather" I/O which
* will ensure they complete prior to unloading the pool.
*/
}
}
/*
* ==========================================================================
* Reexecute or suspend/resume failed I/O
* ==========================================================================
*/
static void
{
pio->io_reexecute = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
if (IO_IS_ALLOCATING(pio))
/*
* As we reexecute pio's children, new children could be created.
* New children go to the head of pio's io_child_list, however,
* so we will (correctly) not reexecute them. The key is that
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
}
/*
* Now that all children have been reexecuted, execute the parent.
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
}
void
{
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
"failure and the failure mode property for this pool "
}
}
int
{
/*
* Reexecute all previously suspended i/o.
*/
return (0);
}
void
{
while (spa_suspended(spa))
}
/*
* ==========================================================================
* Gang blocks.
*
* A gang block is a collection of small blocks that looks to the DMU
* like one large block. When zio_dva_allocate() cannot find a block
* of the requested size, due to either severe fragmentation or the pool
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
* an indirect block: it's an array of block pointers. It consumes
* only one sector and hence is allocatable regardless of fragmentation.
* The gang header's bps point to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
* Critically, the gang block bp's blk_cksum is the checksum of the data,
* not the gang header. This ensures that data block signatures (needed for
* deduplication) are independent of how the block is physically stored.
*
* Gang blocks can be nested: a gang member may itself be a gang block.
* Thus every gang block is a tree in which root and all interior nodes are
* gang headers, and the leaves are normal blocks that contain user data.
* The root of the gang tree is called the gang leader.
*
* To perform any operation (read, rewrite, free, claim) on a gang block,
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
* in the io_gang_tree field of the original logical i/o by recursively
* reading the gang leader and all gang headers below it. This yields
* an in-core tree containing the contents of every gang header and the
* bps for every constituent of the gang block.
*
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
* of the gang header plus zio_checksum_compute() of the data to update the
* gang header's blk_cksum as described above.
*
* The two-phase assemble/issue model solves the problem of partial failure --
* what if you'd freed part of a gang block but then couldn't read the
* gang header for another part? Assembling the entire gang tree first
* ensures that all the necessary gang header I/O has succeeded before
* starting the actual work of free, claim, or write. Once the gang tree
* is assembled, free and claim are in-memory operations that cannot fail.
*
* In the event that a gang write fails, zio_dva_unallocate() walks the
* gang tree to immediately free (i.e. insert back into the space map)
* everything we've allocated. This ensures that we don't get ENOSPC
* errors during repeated allocation attempts.
*
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
* the gang tree, we won't modify the block, so we can safely defer the free
* (knowing that the block is still intact). If we *can* assemble the gang
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
* each constituent bp and we can allocate a new block on the next sync pass.
*
* In all cases, the gang tree allows complete recovery from partial failure.
* ==========================================================================
*/
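/*
 * For reference, the in-core gang tree assembled by zio_gang_assemble() is
 * built from nodes of roughly this shape (see sys/zio.h for the
 * authoritative definition): each node holds one gang header plus a child
 * pointer per gang member, so interior nodes mirror the on-disk gang
 * headers and the child pointers of leaves are NULL.
 */
#if 0	/* example only */
typedef struct zio_gang_node {
	zio_gbh_phys_t		*gn_gbh;
	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
} zio_gang_node_t;
#endif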
static zio_t *
{
return (pio);
&pio->io_bookmark));
}
zio_t *
{
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
* compute a new data checksum, so we do that here. The one
* exception is the gang leader: the pipeline already computed
* its data checksum because that stage precedes gang assembly.
* (Presently, nothing actually uses interior data checksums;
* this is just good hygiene.)
*/
}
/*
* If we are here to damage data for testing purposes,
* leave the GBH alone so that we can detect the damage.
*/
} else {
}
return (zio);
}
/* ARGSUSED */
zio_t *
{
}
/* ARGSUSED */
zio_t *
{
}
NULL,
};
static zio_gang_node_t *
{
return (gn);
}
static void
{
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
}
static void
{
return;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
}
static void
{
}
static void
{
return;
if (BP_SHOULD_BYTESWAP(bp))
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
if (!BP_IS_GANG(gbp))
continue;
}
}
static void
{
/*
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
if (BP_IS_HOLE(gbp))
continue;
}
}
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_STOP);
else
return (ZIO_PIPELINE_CONTINUE);
}
static void
{
return;
}
}
static int
{
int error;
if (error) {
return (ZIO_PIPELINE_CONTINUE);
}
} else {
}
/*
* Create the gang header.
*/
/*
* Create and nowait the gang children.
*/
&pio->io_bookmark));
}
/*
* Set pio's pipeline to just wait for zio to finish.
*/
return (ZIO_PIPELINE_CONTINUE);
}
/*
* The zio_nop_write stage in the pipeline determines if allocating
* a new bp is necessary. By leveraging a cryptographically secure checksum,
* such as SHA256, we can compare the checksums of the new data and the old
* to determine if allocating a new block is required. The nopwrite
* feature can handle writes in either syncing or open context (i.e. zil
* writes) and as a result is mutually exclusive with dedup.
*/
static int
{
/*
* Check to see if the original bp and the new bp have matching
* characteristics (i.e. same checksum, compression algorithms, etc).
* If they don't then just continue with the pipeline which will
* allocate a new bp.
*/
if (BP_IS_HOLE(bp_orig) ||
return (ZIO_PIPELINE_CONTINUE);
/*
* If the checksums match then reset the pipeline so that we
* avoid allocating a new bp and issuing any I/O.
*/
sizeof (uint64_t)) == 0);
}
return (ZIO_PIPELINE_CONTINUE);
}
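/*
 * Sketch of the final comparison described above (assumed shape): when the
 * strong checksums match, the original bp is adopted verbatim and the
 * pipeline is collapsed so that no new allocation or device I/O occurs.
 */
#if 0	/* example only */
	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
		*bp = *bp_orig;				/* keep the old block */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		zio->io_flags |= ZIO_FLAG_NOPWRITE;
	}
#endif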
/*
* ==========================================================================
* Dedup
* ==========================================================================
*/
static void
{
else
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
continue;
&blk);
&zio->io_bookmark));
}
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_STOP);
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_STOP);
}
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static boolean_t
{
/*
* Note: we compare the original data, not the transformed data,
* because when zio->io_bp is an override bp, we will not have
* pushed the I/O transforms. That's an important optimization
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
*/
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
zio->io_orig_size) != 0);
}
}
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
if (ddp->ddp_phys_birth != 0) {
int error;
if (error == 0) {
zio->io_orig_size) != 0)
}
return (error != 0);
}
}
return (B_FALSE);
}
static void
{
return;
}
static void
{
} else {
}
}
static void
{
int p = DDT_PHYS_DITTO;
if (ddp->ddp_phys_birth != 0)
}
}
static int
{
int ditto_copies;
/*
* If we're using a weak checksum, upgrade to a strong checksum
* and try again. If we're already using a strong checksum,
* we can't resolve it, so just convert to an ordinary write.
* (And automatically e-mail a paper to Nature?)
*/
} else {
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* If we arrived here with an override bp, we won't have run
* the transform stack, so we won't have the data we need to
* generate a child i/o. So, toss the override bp and restart.
* This is safe, because using the override bp is just an
* optimization; and it's rare, so the cost doesn't matter.
*/
if (zio->io_bp_override) {
return (ZIO_PIPELINE_CONTINUE);
}
}
if (ddp->ddp_phys_birth != 0)
else
} else if (zio->io_bp_override) {
} else {
}
if (cio)
if (dio)
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static int
{
int error;
int flags = 0;
}
/*
* The dump device does not support gang blocks so allocation on
* behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
* the "fast" gang feature.
*/
METASLAB_GANG_CHILD : 0;
if (error) {
error);
return (zio_write_gang_block(zio));
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int error;
if (error)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Undo an allocation. This is used by zio_done() when an I/O fails
* and we want to give back the block we just allocated.
* This handles both normal blocks and gang blocks.
*/
static void
{
if (!BP_IS_HOLE(bp))
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
}
}
}
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
{
int error = 1;
/*
* ZIL blocks are always contiguous (i.e. not gang blocks) so we
* set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
* when allocating them.
*/
if (use_slog) {
}
if (error) {
}
if (error == 0) {
BP_SET_LEVEL(new_bp, 0);
BP_SET_DEDUP(new_bp, 0);
}
return (error);
}
/*
* Free an intent log block.
*/
void
{
}
/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
*/
/*
* Issue an I/O to the underlying vdev. Typically the issue pipeline
* stops after this stage and will resume upon I/O completion.
* However, there are instances where the vdev layer may need to
* continue the pipeline when an I/O was not issued. Since the I/O
* that was sent to the vdev layer might be different than the one
* currently active in the pipeline (see vdev_queue_io()), we explicitly
* force the underlying vdev layers to call either zio_execute() or
* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
*/
static int
{
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
return (ZIO_PIPELINE_STOP);
}
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
* - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
*/
}
/* Transform logical writes to be a full physical block size. */
}
}
/*
* If this is not a physical io, make sure that it is properly aligned
* before proceeding.
*/
} else {
/*
* For physical writes, we allow 512b aligned writes and assume
* the device will perform a read-modify-write as necessary.
*/
}
/*
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
* This prevents spurious resilvering with nested replication.
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
* A is out of date, we'll read from C+D, then use the data to
* resilver A+B -- but we don't actually want to resilver B, just A.
* The top-level mirror has no way to know this, so instead we just
* discard unnecessary repairs as we work our way down the vdev tree.
* The same logic applies to any form of nested replication:
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
*/
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_CONTINUE);
return (ZIO_PIPELINE_STOP);
return (ZIO_PIPELINE_STOP);
}
}
return (ZIO_PIPELINE_STOP);
}
static int
{
return (ZIO_PIPELINE_STOP);
} else {
}
}
}
if (unexpected_error)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* For non-raidz ZIOs, we can just copy aside the bad data read from the
* disk, and use that to finish the checksum ereport later.
*/
static void
const void *good_buf)
{
/* no processing needed */
}
/*ARGSUSED*/
void
{
}
static int
{
return (ZIO_PIPELINE_STOP);
}
/*
* If the I/O failed, determine whether we should attempt to retry it.
*
* On retry, we cut in line in the issue queue, since we don't want
* compression/checksumming/etc. work to prevent our (cheap) IO reissue.
*/
return (ZIO_PIPELINE_STOP);
}
/*
* If we got an error on a leaf device, convert it to ENXIO
* if the device is not accessible at all.
*/
/*
* If we can't write to an interior vdev (mirror or RAID-Z),
* set vdev_cant_write so that we stop trying to allocate from it.
*/
}
}
return (ZIO_PIPELINE_CONTINUE);
}
void
{
}
void
{
}
void
{
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static int
{
enum zio_checksum checksum;
/*
* This is zio_write_phys().
* We're either generating a label checksum, or none at all.
*/
if (checksum == ZIO_CHECKSUM_OFF)
return (ZIO_PIPELINE_CONTINUE);
} else {
} else {
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int error;
/*
* This is zio_read_phys().
* We're either verifying a label checksum, or nothing at all.
*/
return (ZIO_PIPELINE_CONTINUE);
}
}
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
{
}
/*
* ==========================================================================
* Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
* ==========================================================================
*/
int
{
break;
break;
}
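/*
 * Illustrative sketch of the ranking described above (hypothetical helper;
 * the original body above is elided): map each error onto the rank order,
 * with unknown errors ranked worst, and return whichever error ranks
 * higher.
 */
static int
zio_worst_error_sketch(int e1, int e2)
{
	static const int rank_order[] = { 0, ENXIO, ECKSUM, EIO };
	int nranks = sizeof (rank_order) / sizeof (int);
	int r1, r2;

	for (r1 = 0; r1 < nranks; r1++)
		if (e1 == rank_order[r1])
			break;

	for (r2 = 0; r2 < nranks; r2++)
		if (e2 == rank_order[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}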
/*
* ==========================================================================
* I/O completion
* ==========================================================================
*/
static int
{
return (ZIO_PIPELINE_STOP);
}
/*
* As we notify zio's parents, new parents could be added.
* New parents go to the head of zio's io_parent_list, however,
* so we will (correctly) not notify them. The remainder of zio's
* io_parent_list, from 'pio_next' onward, cannot change because
* all parents must wait for us to be done before they can be done.
*/
}
if (BP_IS_GANG(bp)) {
} else {
}
}
if (zio_injection_enabled &&
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
/*
* If our children haven't all completed,
* wait for them and then repeat this pipeline stage.
*/
return (ZIO_PIPELINE_STOP);
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
}
}
/*
* If there were child vdev/gang/ddt errors, they apply to us now.
*/
/*
* If the I/O on the transformed data was successful, generate any
* checksum reports now while we still have the transformed data.
*/
}
}
}
/*
* If this I/O is attached to a particular vdev,
* generate an error message describing the I/O failure
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
0, 0);
}
}
/*
* Determine whether zio should be reexecuted. This will
* propagate all the way to the root via zio_notify_parent().
*/
if (IO_IS_ALLOCATING(zio) &&
else
}
/*
* Here is a possibly good place to attempt to do
* either combinatorial reconstruction or error correction
* based on checksums. It also might be a good place
* to send out preliminary ereports before we suspend
* processing.
*/
}
/*
* If there were logical child errors, they apply to us now.
* We defer this until now to avoid conflating logical child
* errors with errors that happened to the zio itself when
* updating vdev stats and reporting FMA events above.
*/
/*
* Godfather I/Os should never suspend.
*/
zio->io_reexecute = 0;
if (zio->io_reexecute) {
/*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
* the root, it simply notifies its parent and sticks around.
* The parent, seeing that it still has children in zio_done(),
* does the same. This percolates all the way up to the root.
* The root i/o will reexecute or suspend the entire tree.
*
* This approach ensures that zio_reexecute() honors
* all the original i/o dependency relationships, e.g.
* parents not executing until children are ready.
*/
/*
* "The Godfather" I/O monitors its children but is
* not a true parent to them. It will track them through
* the pipeline but severs its ties whenever they get into
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
}
}
/*
* We're not a root i/o, so there's nothing to do
* but notify our parent. Don't propagate errors
* upward since we haven't permanently failed yet.
*/
/*
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
} else {
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
}
return (ZIO_PIPELINE_STOP);
}
/*
* Report any checksum errors, since the I/O is complete.
*/
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
*/
}
} else {
}
return (ZIO_PIPELINE_STOP);
}
/*
* ==========================================================================
* I/O pipeline definition
* ==========================================================================
*/
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
};
/* dnp is the dnode for zb1->zb_object */
const zbookmark_phys_t *zb2)
{
/* The objset_phys_t isn't before anything. */
return (B_FALSE);
return (nextobj <= zb2thisobj);
}
return (B_TRUE);
return (B_FALSE);
return (B_FALSE);
}