zio.c revision 1f7ad2e1275fff503991bf4b43bc5cf1d815669f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
/*
* ==========================================================================
* I/O priority table
* ==========================================================================
*/
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	4,	/* ZIO_PRIORITY_FREE */
	0,	/* ZIO_PRIORITY_CACHE_FILL */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
};
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
/* At or above this size, force gang blocking - for testing */
uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
/* Force an allocation failure when non-zero */
uint16_t zio_io_fail_shift = 0;
int zio_write_retry = 1;
int zio_resume_threads = 4;
typedef struct zio_sync_pass {
	int zp_defer_free;	/* defer frees after this pass */
	int zp_dontcompress;	/* don't compress after this pass */
	int zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
/*
* Determine if we are allowed to issue the IO based on the
* pool state. If we must wait then block until we are told
* that we may continue.
*/
} \
}
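/*
 * A minimal sketch of the gate described above, assuming a pool-wide
 * lock/condvar pair; the spa_zio_failure_lock/cv names (and the helper
 * name) are illustrative, not necessarily the real spa_t members.
 */
static void
zio_wait_for_pool_resume_sketch(spa_t *spa)
{
	if (spa->spa_state == POOL_STATE_IO_FAILURE) {
		mutex_enter(&spa->spa_zio_failure_lock);
		while (spa->spa_state == POOL_STATE_IO_FAILURE)
			cv_wait(&spa->spa_zio_failure_cv,
			    &spa->spa_zio_failure_lock);
		mutex_exit(&spa->spa_zio_failure_lock);
	}
}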
/*
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
#define IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
/*
* The only way to tell is by looking for the gang pipeline stage
*/
#define IO_IS_REWRITE(zio) \
void
zio_init(void)
{
size_t c;
#ifdef _KERNEL
#endif
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
* for each quarter-power of 2. For large buffers, we want
* a cache for each multiple of PAGESIZE.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
}
if (align != 0) {
char name[36];
dprintf("creating cache for size %5lx align %5lx\n",
}
}
while (--c != 0) {
}
}
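/*
 * A minimal sketch of the size-class selection the comment in zio_init()
 * describes. The helper name and the "4 * SPA_MINBLOCKSIZE" small-buffer
 * cutoff are assumptions; the idea is: small sizes align to
 * SPA_MINBLOCKSIZE, page-multiple sizes align to PAGESIZE, and the rest
 * align to a quarter of the nearest power of two.
 */
static size_t
zio_cache_align_sketch(size_t size)
{
	size_t p2 = size;

	/* Round p2 down to the largest power of two <= size. */
	while (p2 & (p2 - 1))
		p2 &= p2 - 1;

	if (size <= 4 * SPA_MINBLOCKSIZE)
		return (SPA_MINBLOCKSIZE);
	if (P2PHASE(size, PAGESIZE) == 0)
		return (PAGESIZE);
	if (P2PHASE(size, p2 >> 2) == 0)
		return (p2 >> 2);
	return (0);	/* no dedicated cache for this size */
}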
void
zio_fini(void)
{
size_t c;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
}
zio_buf_cache[c] = NULL;
if (zio_data_buf_cache[c] != last_data_cache) {
}
zio_data_buf_cache[c] = NULL;
}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
}
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists so that we will limit the amount
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
* of kernel heap dumped to disk when the kernel panics)
*/
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
}
void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	kmem_cache_free(zio_buf_cache[c], buf);
}
void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	kmem_cache_free(zio_data_buf_cache[c], buf);
}
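/*
 * Usage sketch (illustrative only): metadata buffers come from the
 * zio_buf_*() pair so they appear in crash dumps; file data uses the
 * zio_data_buf_*() pair so it does not. Each buffer must be freed with
 * the matching routine and the original size.
 */
static void
zio_buf_usage_sketch(void)
{
	void *meta = zio_buf_alloc(SPA_MINBLOCKSIZE);		/* ZFS metadata */
	void *data = zio_data_buf_alloc(SPA_MINBLOCKSIZE);	/* file contents */

	zio_buf_free(meta, SPA_MINBLOCKSIZE);
	zio_data_buf_free(data, SPA_MINBLOCKSIZE);
}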
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
{
}
static void
{
}
}
static void
{
void *data;
}
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free)
* ==========================================================================
*/
static zio_t *
{
BP_GET_LEVEL(bp) != 0)
}
/*
* Note on config lock:
*
* If CONFIG_HELD is set, then the caller already has the config
* lock, so we don't need it for this io.
*
* We set CONFIG_GRABBED to indicate that we have grabbed the
* config lock on behalf of this io, so it should be released
* in zio_done.
*
* Unless CONFIG_HELD is set, we will grab the config lock for
* any top-level (parent-less) io, *except* NULL top-level ios.
* The NULL top-level ios rarely have any children, so we delay
* grabbing the lock until the first child is added (but it is
* still grabbed on behalf of the top-level i/o, so additional
* children don't need to also grab it). This greatly reduces
* contention on the config lock.
*/
if (type != ZIO_TYPE_NULL &&
!(flags & ZIO_FLAG_CONFIG_HELD)) {
}
} else {
if (!(flags & ZIO_FLAG_NOBOOKMARK))
}
if (stage < ZIO_STAGE_READY)
}
/*
* Save off the original state in case we need to retry later.
*/
return (zio);
}
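/*
 * A minimal sketch of the config-lock policy described in the note above.
 * The helper name is illustrative and spa_config_enter()'s argument order
 * is an assumption; the flags are the ones this file already uses.
 */
static void
zio_config_grab_sketch(spa_t *spa, zio_t *zio, zio_t *pio, zio_type_t type,
    int flags)
{
	/*
	 * Grab the config lock for top-level (parent-less), non-NULL I/Os
	 * unless the caller already holds it, and remember that we grabbed
	 * it so it can be released in zio_done().
	 */
	if (pio == NULL && type != ZIO_TYPE_NULL &&
	    !(flags & ZIO_FLAG_CONFIG_HELD)) {
		spa_config_enter(spa, RW_READER, zio);
		zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
	}
}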
static void
{
}
zio_t *
int flags)
{
return (zio);
}
zio_t *
{
}
zio_t *
{
/*
* then attempt to satisfy the read.
*/
/*
* Work off our copy of the bp so the caller can free it.
*/
return (zio);
}
zio_t *
{
if (compress != ZIO_COMPRESS_OFF)
/* XXX the bp usually (always?) gets re-zeroed later */
} else {
/* Make sure someone doesn't change their mind on overwrites */
}
return (zio);
}
zio_t *
{
return (zio);
}
static void
{
/* Free up the previous block */
}
}
static zio_t *
{
return (zio);
}
zio_t *
{
}
return (zio);
}
zio_t *
{
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
* immediate writes contain committed data, but in a txg that was
* *not* committed. Upon opening the pool after an unclean shutdown,
* the intent log claims all blocks that contain immediate write data
* so that the SPA knows they're in use.
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
*/
return (zio);
}
zio_t *
{
int c;
if (vd->vdev_children == 0) {
} else {
for (c = 0; c < vd->vdev_children; c++)
}
return (zio);
}
static void
int checksum)
{
if (checksum != ZIO_CHECKSUM_OFF)
}
zio_t *
{
/*
* Work off our copy of the bp so the caller can free it.
*/
return (zio);
}
zio_t *
{
void *wbuf;
/*
* zbt checksums are necessarily destructive -- they modify the buffer
* being written. Therefore, we must make a local copy in case the data is
* being written to multiple places.
*/
}
return (zio);
}
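/*
 * Sketch of the local-copy step described above. It assumes this file's
 * zio_push_transform(zio, data, size, bufsize) helper; the helper name
 * zio_zbt_copy_sketch is illustrative.
 */
static void
zio_zbt_copy_sketch(zio_t *zio, void *data, uint64_t size)
{
	void *wbuf = zio_buf_alloc(size);

	bcopy(data, wbuf, size);
	zio_push_transform(zio, wbuf, size, size);
}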
/*
* Create a child I/O to do some work for us. It has no associated bp.
*/
zio_t *
{
/*
* If we have the bp, then the child should perform the
* checksum and the parent need not. This pushes error
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
}
return (cio);
}
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
int
{
int error;
return (error);
}
void
{
}
/*
* ==========================================================================
* ==========================================================================
*/
static void
{
if (*countp == 0) {
} else {
}
}
static void
{
pio->io_stalled = 0;
} else {
}
}
static void
{
}
void
{
}
static void
{
}
}
}
static void
{
}
static void
{
/*
* Preserve the failed bp so that the io_ready() callback can
* update the accounting accordingly. The callback will also be
* responsible for freeing the previously allocated block, if one
* exists.
*/
/*
* We must zero out the old DVA and blk_birth before reallocating
* the bp. We don't want to do this if this is a rewrite however.
*/
if (!IO_IS_REWRITE(zio)) {
}
if (pio) {
/*
* Let the parent know that we will
* re-alloc the write (=> new bp info).
*/
/*
* If the parent I/O is still in the open stage, then
* don't bother telling it to retry since it hasn't
* progressed far enough for it to care.
*/
}
/*
* We are getting ready to process the retry request so clear
* the flag and the zio's current error status.
*/
}
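/*
 * Sketch of the "zero out the old DVA and blk_birth" step described above
 * (helper name is illustrative): forget the old allocation so the DVA
 * allocate stage starts from a clean block pointer.
 */
static void
zio_reset_bp_alloc_sketch(blkptr_t *bp)
{
	bzero(bp->blk_dva, sizeof (bp->blk_dva));
	bp->blk_birth = 0;
}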
int
{
/*
* Probe all of the vdevs that have experienced an I/O error.
* If we are still unable to verify the integrity of the vdev
* then we prevent the resume from proceeding.
*/
int error = 0;
continue;
if (error) {
return (error);
}
}
/*
* Clear the vdev stats so that I/O can flow.
*/
/*
* If we are resuming an allocating I/O then we force it
* to retry and let it resume operation where it left off.
* Otherwise, go back to the ready stage and pick up from
* there.
*/
} else {
}
}
/*
* Wait for the taskqs to finish and recheck the pool state since
* it's possible that a resumed I/O has failed again.
*/
return (EIO);
return (0);
}
static void
{
/*
* We've experienced an unrecoverable failure so
* set the pool state accordingly and queue all
* failed IOs.
*/
#ifndef _KERNEL
/* Used to notify ztest that the pool has suspended */
#endif
}
static void
{
}
}
/*
* Some child I/O has indicated that a retry is necessary, so
* we set an error on the I/O and let the logic below do the
* rest.
*/
/*
* If this I/O is attached to a particular vdev,
* generate an error message describing the I/O failure
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
/*
* For root I/O requests, tell the SPA to log the error
* appropriately. Also, generate a logical data
* ereport.
*/
0, 0);
}
/*
* If we are an allocating I/O or have been told to retry
* then attempt to reissue the I/O on another vdev unless
* the pool is out of space. We handle this condition
* based on the spa's failmode property.
*/
(IO_IS_ALLOCATING(zio) ||
return;
}
/*
* For I/O requests that cannot fail, we carry out
* the requested behavior based on the failmode pool
* property.
*
* XXX - Need to differentiate between an ENOSPC as
* a result of vdev failures vs. a full pool.
*/
char *blkbuf;
#ifdef ZFS_DEBUG
if (blkbuf) {
}
"bad checksum" : "I/O failure",
#endif
fm_panic("Pool '%s' has encountered an "
"uncorrectable I/O failure and the "
"failure mode property for this pool "
} else {
"an uncorrectable I/O error. Manual "
"intervention is required.",
}
return;
}
}
}
static void
{
}
/*
* Note: this I/O is now done, and will shortly be freed, so there is no
* need to clear this (or any other) flag.
*/
} else {
}
}
/*
* ==========================================================================
* Compression support
* ==========================================================================
*/
static void
{
void *cbuf;
int pass;
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
* have to allocate new blocks. But compression changes
* the blocksize, which forces a reallocate, and makes
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
} else {
pass = 1;
}
if (compress != ZIO_COMPRESS_OFF)
/*
* The final pass of spa_sync() must be all rewrites, but the first
* few passes offer a trade-off: allocating blocks defers convergence,
* but newly allocated blocks are sequential, so they can be written
* to disk faster. Therefore, we allow the first few passes of
* spa_sync() to reallocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
} else {
if (csize == 0) {
} else {
}
}
}
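/*
 * Sketch of the pass-based compression decision described above. It
 * assumes spa_sync_pass() reports the current sync pass for the txg;
 * the helper name is illustrative.
 */
static int
zio_choose_compress_sketch(zio_t *zio, int compress)
{
	int pass = 1;

	if (zio->io_bp->blk_birth == zio->io_txg) {
		/* Rewriting a block born this txg => a later spa_sync() pass. */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	}
	return (compress);
}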
static void
{
void *data;
}
/*
* ==========================================================================
* Gang block support
* ==========================================================================
*/
static void
{
/*
* By default, the pipeline assumes that we're dealing with a gang
* block. If we're not, strip out any gang-specific stages.
*/
}
static void
{
}
static void
{
}
static void
{
int i;
ASSERT(i < SPA_GBH_NBLKPTRS);
&zio->io_bookmark));
}
}
static void
{
int i;
ASSERT(i < SPA_GBH_NBLKPTRS);
&zio->io_bookmark));
}
}
static void
{
int i;
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
if (BP_IS_HOLE(gbp))
continue;
}
}
static void
{
int i;
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
if (BP_IS_HOLE(gbp))
continue;
}
}
static void
{
int d;
}
}
static int
{
int error;
int i, d;
B_FALSE);
if (error)
return (error);
for (d = 0; d < gbh_ndvas; d++)
/* We need to test multi-level gang blocks */
if (error == 0)
break;
/* XXX - free up previous allocations? */
if (maxalloc == SPA_MINBLOCKSIZE)
return (error);
}
&zio->io_bookmark));
} else {
}
}
/*
* As much as we'd like this to be zio_wait_children_ready(),
* updating our ASIZE doesn't happen until the io_done callback,
* so we have to wait for that to finish in order for our BP
* to be stable.
*/
return (0);
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static void
{
int error;
/* For testing, make some blocks above a certain size be gang blocks */
if (error)
return;
}
/*
* Inject an allocation failure for ordinary (non-retried)
* writes. We do this after the gang block testing block so that
* they don't inherit the retry flag.
*/
if (zio_io_fail_shift &&
if (error == 0) {
if (error == 0)
return;
} else {
}
}
static void
{
}
static void
{
}
/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
*/
static void
{
/*
* If the pool is already in a failure state then just suspend
* this I/O until the problem is resolved. We will reissue it
* at that time.
*/
return;
}
/* The mirror_ops handle multiple DVAs in a single BP */
return;
}
vd->vdev_children == 0) {
}
}
}
/* zio_next_stage_async() gets called from io completion interrupt */
}
static void
{
/* The mirror_ops handle multiple DVAs in a single BP */
else
}
/* XXPOLICY */
static boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd && vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}
static void
{
void *abuf;
}
/*
* If the I/O failed, determine whether we should attempt to retry it.
*/
/* XXPOLICY */
if (zio_should_retry(zio)) {
zio->io_retries++;
/* XXPOLICY */
dprintf("retry #%d for %s to %s offset %llx\n",
return;
}
}
void
{
}
void
{
}
void
{
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static void
{
}
static void
{
}
static void
{
}
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
{
}
/*
* Set the external verifier for a gang block based on stuff in the bp
*/
void
{
}
/*
* ==========================================================================
* Define the pipeline
* ==========================================================================
*/
static void
{
}
};
/*
* Move an I/O to the next stage of the pipeline and execute that stage.
* There's no locking on io_stage because there's no legitimate way for
* multiple threads to be attempting to process the same I/O.
*/
void
{
dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
}
continue;
/*
* See the comment in zio_next_stage_async() about per-CPU taskqs.
*/
(void) taskq_dispatch(tq,
} else {
}
}
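/*
 * Sketch of the stage-advance step the next-stage routines rely on:
 * io_pipeline is a bitmask of stages, so we bump io_stage until we land
 * on a stage this I/O actually participates in (helper name illustrative).
 */
static void
zio_advance_stage_sketch(zio_t *zio)
{
	while (((1U << ++zio->io_stage) & zio->io_pipeline) == 0)
		continue;
}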
void
{
dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
}
continue;
/*
* For performance, we'll probably want two sets of task queues:
* per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
* part is for read performance: since we have to make a pass over
* the data to checksum it anyway, we want to do this on the same CPU
* that issued the read, because (assuming CPU scheduling affinity)
* that thread is probably still there. Getting this optimization
* right avoids performance-hostile cache-to-cache transfers.
*
* Note that having two sets of task queues is also necessary for
* correctness: if all of the issue threads get bogged down waiting
* for dependent reads (e.g. metaslab freelist) to complete, then
* there won't be any threads available to service I/O completion
* interrupts.
*/
else
(void) taskq_dispatch(tq,
} else {
}
}
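/*
 * Sketch of the issue-vs-interrupt dispatch the comment above describes.
 * The per-type spa_zio_issue_taskq/spa_zio_intr_taskq arrays and the
 * helper name should be treated as assumptions.
 */
static void
zio_dispatch_sketch(zio_t *zio, int stage, task_func_t *func)
{
	spa_t *spa = zio->io_spa;
	taskq_t *tq;

	if (stage < ZIO_STAGE_VDEV_IO_DONE)
		tq = spa->spa_zio_issue_taskq[zio->io_type];	/* issue side */
	else
		tq = spa->spa_zio_intr_taskq[zio->io_type];	/* completion side */

	(void) taskq_dispatch(tq, func, zio, TQ_SLEEP);
}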
void
zio_resubmit_stage_async(void *arg)
{
}
static boolean_t
{
}
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
{
int error;
return (ENOSPC);
}
/*
* We were passed the previous log block's DVA in bp->blk_dva[0].
* We use that as a hint for which vdev to allocate from next.
*/
if (error)
if (error == 0) {
BP_SET_LEVEL(new_bp, 0);
}
return (error);
}
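/*
 * Sketch of the hint-based allocation described above. metaslab_alloc()'s
 * trailing hint arguments and the ZILOG checksum/type settings are
 * assumptions here; old_bp is the previous log block, passed as a hint so
 * successive log blocks tend to land on the same vdev.
 */
static int
zio_alloc_log_blk_sketch(spa_t *spa, metaslab_class_t *mc, uint64_t size,
    blkptr_t *old_bp, blkptr_t *new_bp, uint64_t txg)
{
	int error = metaslab_alloc(spa, mc, size, new_bp, 1, txg,
	    old_bp, B_TRUE);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
	}
	return (error);
}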
/*
* Free an intent log block. We know it can't be a gang block, so there's
* nothing to do except metaslab_free() it.
*/
void
{
}
/*
* start an async flush of the write cache for this vdev
*/
void
{
/*
* Lock out configuration changes.
*/
}
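/*
 * Sketch of the async write-cache flush described above: issue a
 * DKIOCFLUSHWRITECACHE ioctl zio (from <sys/dkio.h>) against the vdev and
 * don't wait for it. zio_ioctl()'s exact argument order here is an
 * assumption; the flags keep a flush failure from failing the caller.
 */
static void
zio_flush_vdev_sketch(zio_t *pio, spa_t *spa, vdev_t *vd)
{
	(void) zio_nowait(zio_ioctl(pio, spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}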