/* zio.c -- revision e56967561b8c65063074511dae7be2c00d5de858 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
/*
* ==========================================================================
* I/O priority table
* ==========================================================================
*/
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
0, /* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
6, /* ZIO_PRIORITY_ASYNC_READ */
4, /* ZIO_PRIORITY_ASYNC_WRITE */
4, /* ZIO_PRIORITY_FREE */
0, /* ZIO_PRIORITY_CACHE_FILL */
0, /* ZIO_PRIORITY_LOG_WRITE */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
};
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
/* Force an allocation failure when non-zero */
uint16_t zio_io_fail_shift = 0;
/* Enable/disable the write-retry logic */
int zio_write_retry = 1;
/* Taskq used to reissue suspended I/Os, and its thread count */
taskq_t *zio_taskq;
int zio_resume_threads = 4;
typedef struct zio_sync_pass {
int zp_defer_free; /* defer frees after this pass */
int zp_dontcompress; /* don't compress after this pass */
int zp_rewrite; /* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
1, /* zp_defer_free */
4, /* zp_dontcompress */
1, /* zp_rewrite */
};
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
/*
* Determine if we are allowed to issue the IO based on the
* pool state. If we must wait then block until we are told
* that we may continue.
*/
} \
}
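/*
 * Illustrative sketch only (the original macro body is elided above,
 * leaving just its closing braces): a pool-state gate of this shape
 * blocks I/O issue while the pool is in a failure state.  The lock,
 * cv, and state names below are assumptions, not necessarily the
 * originals:
 *
 *	#define	ZIO_ENTER(spa) {					\
 *		mutex_enter(&(spa)->spa_zio_lock);			\
 *		while ((spa)->spa_state == POOL_STATE_IO_FAILURE)	\
 *			cv_wait(&(spa)->spa_zio_cv,			\
 *			    &(spa)->spa_zio_lock);			\
 *		mutex_exit(&(spa)->spa_zio_lock);			\
 *	}
 */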
/*
* An allocation zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
#define IO_IS_ALLOCATING(zio) \
	(((zio)->io_pipeline | (zio)->io_orig_pipeline) & \
	(1U << ZIO_STAGE_DVA_ALLOCATE)) /* body assumed; original elided */
void
zio_init(void)
{
size_t c;
#ifdef _KERNEL
#endif
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
* for each quarter-power of 2. For large buffers, we want
* a cache for each multiple of PAGESIZE.
*/
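	/*
	 * Sketch of the size-class selection described above (assumed,
	 * since the loop body is elided from this copy): pick an
	 * alignment only when the size is an exact multiple of it.
	 *
	 *	size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
	 *	size_t p2 = size, align = 0;
	 *
	 *	while (p2 & (p2 - 1))
	 *		p2 &= p2 - 1;	(largest power of 2 <= size)
	 *
	 *	if (size <= 4 * SPA_MINBLOCKSIZE)
	 *		align = SPA_MINBLOCKSIZE;
	 *	else if (P2PHASE(size, PAGESIZE) == 0)
	 *		align = PAGESIZE;
	 *	else if (P2PHASE(size, p2 >> 2) == 0)
	 *		align = p2 >> 2;	(quarter-power-of-2)
	 */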
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
}
if (align != 0) {
char name[36];
}
}
while (--c != 0) {
}
}
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL, *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		/* Several sizes may share one cache; destroy each once */
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists to limit the amount of ZFS
* data that shows up in a kernel crashdump, thus reducing the amount of
* kernel heap dumped to disk when the kernel panics.
*/
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}
void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	kmem_cache_free(zio_buf_cache[c], buf);
}
void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	kmem_cache_free(zio_data_buf_cache[c], buf);
}
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
{
}
static void
{
}
}
static void
{
void *data;
}
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free)
* ==========================================================================
*/
static zio_t *
{
}
/*
* Note on config lock:
*
* If CONFIG_HELD is set, then the caller already has the config
* lock, so we don't need it for this io.
*
* We set CONFIG_GRABBED to indicate that we have grabbed the
* config lock on behalf of this io, so it should be released
* in zio_done.
*
* Unless CONFIG_HELD is set, we will grab the config lock for
* any top-level (parent-less) io, *except* NULL top-level ios.
* The NULL top-level ios rarely have any children, so we delay
* grabbing the lock until the first child is added (but it is
* still grabbed on behalf of the top-level i/o, so additional
* children don't need to also grab it). This greatly reduces
* contention on the config lock.
*/
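	/*
	 * Sketch of the elided body of the test below (assumed): grab
	 * the config lock on behalf of this I/O and mark it so that
	 * zio_done() releases it.
	 *
	 *	spa_config_enter(spa, RW_READER, zio);
	 *	zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
	 */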
if (type != ZIO_TYPE_NULL &&
!(flags & ZIO_FLAG_CONFIG_HELD)) {
}
} else {
if (!(flags & ZIO_FLAG_NOBOOKMARK))
}
if (stage < ZIO_STAGE_READY)
}
/*
* Save off the original state in case we need to retry later.
*/
return (zio);
}
static void
{
}
zio_t *
int flags)
{
return (zio);
}
zio_t *
{
}
zio_t *
{
/*
* then attempt to satisfy the read.
*/
/*
* Work off our copy of the bp so the caller can free it.
*/
return (zio);
}
zio_t *
{
/* XXX the bp usually (always?) gets re-zeroed later */
} else {
/* Make sure someone doesn't change their mind on overwrites */
}
return (zio);
}
zio_t *
{
return (zio);
}
static void
{
/* Free up the previous block */
}
}
static zio_t *
{
return (zio);
}
zio_t *
{
}
return (zio);
}
zio_t *
{
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
* immediate writes contain committed data, but in a txg that was
* *not* committed. Upon opening the pool after an unclean shutdown,
* the intent log claims all blocks that contain immediate write data
* so that the SPA knows they're in use.
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
*/
return (zio);
}
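/*
 * Hypothetical usage sketch (caller and arguments assumed, for
 * illustration only): at pool open, the ZIL would claim each intent
 * log block it finds, in the pool's first txg:
 *
 *	(void) zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
 *	    NULL, NULL));
 */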
zio_t *
{
int c;
if (vd->vdev_children == 0) {
} else {
for (c = 0; c < vd->vdev_children; c++)
}
return (zio);
}
static void
{
#ifdef ZFS_DEBUG
if (labels) {
}
#endif
if (checksum != ZIO_CHECKSUM_OFF)
}
zio_t *
{
/*
* Work off our copy of the bp so the caller can free it.
*/
return (zio);
}
zio_t *
{
void *wbuf;
/*
* zbt checksums are necessarily destructive -- they modify
* the buffer to embed the verifier/checksum itself.
* Therefore, we must make a local copy in case the data is
* being written to multiple places.
*/
}
return (zio);
}
/*
* Create a child I/O to do some work for us. It has no associated bp.
*/
zio_t *
{
/*
* If we have the bp, then the child should perform the
* checksum and the parent need not. This pushes error
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
}
return (cio);
}
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
static void
{
zio->io_failed_vds_count = 0;
}
}
int
{
int error;
return (error);
}
void
{
}
void
{
}
static int
{
return (ZIO_PIPELINE_STOP);
}
/*
* ==========================================================================
* Parent/child I/O interlocks
* ==========================================================================
*/
static int
{
int rv = ZIO_PIPELINE_CONTINUE;
if (*countp != 0) {
}
return (rv);
}
static void
{
int i;
return;
for (i = 0; i < oldcount; i++) {
return;
}
}
}
static void
{
}
pio->io_stalled = 0;
} else {
}
}
int
zio_wait_for_children_ready(zio_t *zio)
{
	/* Stage and helper names assumed from this era of the code */
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready));
}
int
zio_wait_for_children_done(zio_t *zio)
{
	/* Stage and helper names assumed from this era of the code */
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone));
}
static int
{
}
if (BP_IS_GANG(bp)) {
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_vdev_retry_io(zio_t *zio)	/* name taken from the call site below */
{
/*
* Preserve the failed bp so that the io_ready() callback can
* update the accounting accordingly. The callback will also be
* responsible for freeing the previously allocated block, if one
* exists.
*/
/*
* We must zero out the old DVA and blk_birth before reallocating
* the bp.
*/
if (pio) {
/*
* Let the parent know that we will
* re-alloc the write (=> new bp info).
*/
/*
* If the parent I/O is still in the open stage, then
* don't bother telling it to retry since it hasn't
* progressed far enough for it to care.
*/
}
/*
* We are getting ready to process the retry request so clear
* the flag and the zio's current error status.
*/
return (ZIO_PIPELINE_CONTINUE);
}
int
zio_vdev_resume_io(spa_t *spa)	/* signature assumed */
{
/*
* Probe all of the vdevs that have experienced an I/O error.
* If we are still unable to verify the integrity of the vdev
* then we prevent the resume from proceeding.
*/
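	/*
	 * Sketch of the elided probe loop (list and helper names are
	 * assumptions, not necessarily the originals):
	 *
	 *	for (vd = list_head(&failed_vdevs); vd != NULL;
	 *	    vd = list_next(&failed_vdevs, vd)) {
	 *		if (!vdev_is_failing(vd))	(hypothetical test)
	 *			continue;
	 *		if ((error = vdev_probe(vd)) != 0)
	 *			break;
	 *	}
	 */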
int error = 0;
continue;
if (error) {
return (error);
}
}
/*
* Clear the vdev stats so that I/O can flow.
*/
/*
* If we are resuming an allocating I/O then we force it
* to retry and let it resume operation where it left off.
* Otherwise, go back to the ready stage and pick up from
* there.
*/
} else {
}
}
/*
* Wait for the taskqs to finish and recheck the pool state since
* it's possible that a resumed I/O has failed again.
*/
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)	/* assumed check */
		return (EIO);
return (0);
}
static int
zio_vdev_suspend_io(zio_t *zio)	/* name taken from call sites in this file */
{
/*
* We've experienced an unrecoverable failure so
* set the pool state accordingly and queue all
* failed IOs.
*/
#ifndef _KERNEL
/* Used to notify ztest that the pool has suspended */
#endif
return (ZIO_PIPELINE_STOP);
}
static void
{
char *blkbuf;
#ifdef ZFS_DEBUG
if (blkbuf) {
}
if (blkbuf)
#endif
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
"failure and the failure mode property for this pool "
}
}
static int
{
}
}
/*
* Some child I/O has indicated that a retry is necessary, so
* we set an error on the I/O and let the logic below do the
* rest.
*/
/*
* If this I/O is attached to a particular vdev,
* generate an error message describing the I/O failure
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
/*
* For root I/O requests, tell the SPA to log the error
* appropriately. Also, generate a logical data
* ereport.
*/
0, 0);
}
/*
* If we are an allocating I/O then we attempt to reissue
* the I/O on another vdev unless the pool is out of space.
* We handle this condition based on the spa's failmode
* property.
*/
return (zio_vdev_retry_io(zio));
/*
* For I/O requests that cannot fail, we carry out
* the requested behavior based on the failmode pool
* property.
*
* XXX - Need to differentiate between an ENOSPC as
* a result of vdev failures vs. a full pool.
*/
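		/*
		 * Sketch of the failmode dispatch implied above (assumed;
		 * the actual branch is elided from this copy):
		 *
		 *	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		 *		panic via the fm_panic() helper above;
		 *	else
		 *		fall through to zio_vdev_suspend_io() below
		 */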
int i;
for (i = 0; i < zio->io_failed_vds_count; i++) {
zio->io_failed_vds[i]);
}
if (zio->io_failed_vds_count == 0) {
}
sizeof (vdev_t *));
zio->io_failed_vds_count = 0;
}
return (zio_vdev_suspend_io(zio));
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
}
/*
* Note: this I/O is now done, and will shortly be freed, so there is no
* need to clear this (or any other) flag.
*/
} else {
}
return (ZIO_PIPELINE_STOP);
}
/*
* ==========================================================================
* Compression support
* ==========================================================================
*/
static int
{
void *cbuf;
int pass;
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
* have to allocate new blocks. But compression changes
* the blocksize, which forces a reallocate, and makes
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
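		/*
		 * Sketch of the pass check (assumed; spa_sync_pass() and
		 * the zio_sync_pass tunables are defined at the top of
		 * this file):
		 *
		 *	pass = spa_sync_pass(zio->io_spa);
		 *	if (pass > zio_sync_pass.zp_dontcompress)
		 *		compress = ZIO_COMPRESS_OFF;
		 */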
} else {
pass = 1;
}
if (compress != ZIO_COMPRESS_OFF)
/*
* The final pass of spa_sync() must be all rewrites, but the first
* few passes offer a trade-off: allocating blocks defers convergence,
* but newly allocated blocks are sequential, so they can be written
* to disk faster. Therefore, we allow the first few passes of
* spa_sync() to reallocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
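	/*
	 * Sketch of the rewrite-vs-reallocate choice (assumed; the elided
	 * test keys off zp_rewrite, defined at the top of this file):
	 *
	 *	if (csize == BP_GET_PSIZE(bp) &&
	 *	    pass > zio_sync_pass.zp_rewrite)
	 *		rewrite the existing block in place;
	 *	else
	 *		clear the DVAs and go through allocation again
	 */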
} else {
if (csize == 0) {
} else {
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
void *data;
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Gang block support
* ==========================================================================
*/
static void
{
}
static int
{
return (zio_wait_for_children_done(zio));
}
static int
{
int i;
ASSERT(i < SPA_GBH_NBLKPTRS);
}
return (zio_wait_for_children_done(zio));
}
static int
{
int i;
ASSERT(i < SPA_GBH_NBLKPTRS);
&zio->io_bookmark));
}
return (zio_wait_for_children_ready(zio));
}
static int
{
int i;
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
if (BP_IS_HOLE(gbp))
continue;
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
int i;
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
if (BP_IS_HOLE(gbp))
continue;
}
return (ZIO_PIPELINE_CONTINUE);
}
static void
{
int d;
}
}
static int
{
int error;
int i, d;
B_FALSE);
if (error) {
return (ZIO_PIPELINE_CONTINUE);
}
for (d = 0; d < gbh_ndvas; d++)
if (error == 0)
break;
/* XXX - free up previous allocations? */
if (maxalloc == SPA_MINBLOCKSIZE) {
return (ZIO_PIPELINE_CONTINUE);
}
}
&zio->io_bookmark));
} else {
}
}
/*
* As much as we'd like this to be 'ready' instead of 'done',
* updating our ASIZE doesn't happen until the io_done callback,
* so we have to wait for that to finish in order for our BP
* to be stable.
*/
return (zio_wait_for_children_done(zio));
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static int
{
int error;
/*
 * For testing purposes, we can force a fraction of allocating
 * writes to fail (see zio_io_fail_shift above).
 */
if (zio_io_fail_shift &&
if (error == 0) {
} else {
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
*/
static int
{
/*
* If the pool is already in a failure state then just suspend
* this I/O until the problem is resolved. We will reissue it
* at that time.
*/
return (zio_vdev_suspend_io(zio));
/*
* The mirror_ops handle multiple DVAs in a single BP
*/
}
}
}
}
static int
{
}
/* XXPOLICY */
static boolean_t
zio_should_retry(zio_t *zio)
{
return (B_FALSE);
return (B_FALSE);
return (B_FALSE);
if (vd->vdev_is_failing)
return (B_FALSE);
}
return (B_FALSE);
if (zio->io_retries > 0)
return (B_FALSE);
return (B_TRUE);
}
static int
{
void *abuf;
}
/*
* If the I/O failed, determine whether we should attempt to retry it.
*/
/* XXPOLICY */
if (zio_should_retry(zio)) {
zio->io_retries++;
/* XXPOLICY */
return (ZIO_PIPELINE_CONTINUE);
}
return (ZIO_PIPELINE_CONTINUE);
}
void
{
}
void
{
}
void
{
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
return (ZIO_PIPELINE_CONTINUE);
}
static int
{
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
zio_checksum_verified(zio_t *zio)
{
	/* Assumed body: drop the verify stage from this zio's pipeline */
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}
/*
* Set the external verifier for a gang block based on stuff in the bp
*/
void
{
}
/*
* ==========================================================================
* Define the pipeline
* ==========================================================================
*/
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { /* decl assumed */
	NULL,
	/* ... per-stage handler entries elided in this copy ... */
	NULL,
};
/*
* Execute the I/O pipeline until one of the following occurs:
* (1) the I/O completes; (2) the pipeline stalls waiting for
* dependent child I/Os; (3) the I/O issues, so we're waiting
* for an I/O completion interrupt; (4) the I/O is delegated by
* vdev-level caching or aggregation; (5) the I/O is deferred
* due to vdev-level queueing; (6) the I/O is handed off to
* another thread. In all cases, the pipeline stops whenever
* there's no CPU work; it never burns a thread in cv_wait().
*
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
void
{
int rv;
/*
* If an error occurred outside the vdev stack,
* just execute the interlock stages to clean up.
*/
continue;
if (rv == ZIO_PIPELINE_STOP)
return;
}
}
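/*
 * Sketch of the elided dispatch loop (the stage table and return codes
 * are the ones used in this file; the exact advance logic is assumed):
 *
 *	while (zio->io_stage < ZIO_STAGE_DONE) {
 *		rv = zio_pipeline[++zio->io_stage](zio);
 *		if (rv == ZIO_PIPELINE_STOP)
 *			return;
 *		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
 *	}
 */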
static boolean_t
{
}
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)	/* parameter list partly assumed */
{
int error;
return (ENOSPC);
}
/*
* We were passed the previous log block's DVA in bp->blk_dva[0].
* We use that as a hint for which vdev to allocate from next.
*/
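	/*
	 * e.g. (sketch -- the exact metaslab_alloc() invocation is elided
	 * and its argument list here is an assumption):
	 *
	 *	error = metaslab_alloc(spa, spa_log_class(spa), size,
	 *	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
	 */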
if (error)
if (error == 0) {
BP_SET_LEVEL(new_bp, 0);
}
return (error);
}
/*
* Free an intent log block. We know it can't be a gang block, so there's
* nothing to do except metaslab_free() it.
*/
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	/* Assumed body, per the comment above */
	metaslab_free(spa, bp, txg, B_FALSE);
}
/*
* Start an async flush of the write cache for this vdev.
*/
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	/* Assumed body: fire-and-forget write-cache-flush ioctl */
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}