/* zio.c revision fbabab8faf7439009737ccefe9d50152b38c26d1 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
/*
* ==========================================================================
* I/O priority table
* ==========================================================================
*/
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
6, /* ZIO_PRIORITY_ASYNC_READ */
4, /* ZIO_PRIORITY_ASYNC_WRITE */
4, /* ZIO_PRIORITY_FREE */
0, /* ZIO_PRIORITY_CACHE_FILL */
0, /* ZIO_PRIORITY_LOG_WRITE */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
};
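/*
 * Illustrative sketch (not part of the original source; ZIO_EXAMPLES is
 * a hypothetical guard): the table maps an I/O class to a relative
 * scheduling priority, where lower values are more urgent.
 */
#ifdef ZIO_EXAMPLES
static boolean_t
zio_example_priority(void)
{
	/* Sync writes (0) rank ahead of scrub I/O (20). */
	return (zio_priority_table[ZIO_PRIORITY_SYNC_WRITE] <
	    zio_priority_table[ZIO_PRIORITY_SCRUB]);
}
#endif	/* ZIO_EXAMPLES */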
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
/* At or above this size, force gang blocking - for testing */
uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
typedef struct zio_sync_pass {
int zp_defer_free; /* defer frees after this pass */
int zp_dontcompress; /* don't compress after this pass */
	int zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

static zio_sync_pass_t zio_sync_pass = {
1, /* zp_defer_free */
4, /* zp_dontcompress */
1, /* zp_rewrite */
};
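/*
 * Illustrative sketch (not original code; ZIO_EXAMPLES is a
 * hypothetical guard) of how these thresholds gate per-pass behavior.
 * The real checks appear in zio_write_compress() below.
 */
#ifdef ZIO_EXAMPLES
static int
zio_example_pick_compress(int pass, int compress)
{
	/* After pass zp_dontcompress, stop compressing so spa_sync() converges. */
	if (pass > zio_sync_pass.zp_dontcompress)
		compress = ZIO_COMPRESS_OFF;
	return (compress);
}
#endif	/* ZIO_EXAMPLES */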
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
static kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

void
zio_init(void)
{
	size_t c;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}
		if (align != 0) {
			char name[30];
			(void) sprintf(name, "zio_buf_%lu", size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, 0);
			dprintf("creating cache for size %5lx align %5lx\n",
			    size, align);
		}
	}

	/* Point sizes without their own cache at the next larger cache. */
	while (--c != 0) {
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];
	}
}
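/*
 * Worked example of the sizing rule above (illustrative; assumes
 * SPA_MINBLOCKSIZE == 512 and PAGESIZE == 8192): a 1K buffer is
 * "small" (<= 4 * 512), so its cache is 512-byte aligned.  A 40K
 * buffer is a multiple of PAGESIZE, so it gets PAGESIZE alignment.
 * A 10K buffer is not page-aligned, but it is a multiple of a quarter
 * of its power-of-2 floor (8192 / 4 == 2048), so it gets a
 * quarter-power-of-2 cache.  Sizes matching no rule get no cache of
 * their own and share the next larger cache.
 */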
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;
	}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	kmem_cache_free(zio_buf_cache[c], buf);
}
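/*
 * Usage sketch (not original code; ZIO_EXAMPLES is a hypothetical
 * guard): the size passed to zio_buf_free() must match the size passed
 * to zio_buf_alloc(), because the size alone selects the kmem cache.
 */
#ifdef ZIO_EXAMPLES
static void
zio_example_buf(void)
{
	void *buf = zio_buf_alloc(3 << SPA_MINBLOCKSHIFT);

	zio_buf_free(buf, 3 << SPA_MINBLOCKSHIFT);
}
#endif	/* ZIO_EXAMPLES */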
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	/* ... push (data, size, bufsize) onto zio->io_transform_stack ... */
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	/* ... pop the top transform into *data, *size, *bufsize ... */
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	while (zio->io_transform_stack != NULL) {
		zio_pop_transform(zio, &data, &size, &bufsize);
		zio_buf_free(data, bufsize);
	}
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free)
* ==========================================================================
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags,
    uint32_t stage, uint32_t pipeline)
{
	zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);

	/* ... initialize io_spa, io_txg, io_data, io_type, etc. ... */

	if (bp != NULL) {
		/* XXBP - Need to inherit this when it matters */
		zio->io_dva_index = 0;
	}

	if (pio == NULL) {
		if (!(flags & ZIO_FLAG_CONFIG_HELD))
			spa_config_enter(spa, RW_READER);
	} else {
		/* ... register this zio as a child of pio ... */
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
	}

	return (zio);
}
zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}
zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	dva_t *dva;

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done,
	    private, ZIO_TYPE_READ, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;
	dva = ZIO_GET_DVA(zio);

	if (DVA_GET_GANG(dva)) {
		/* ... push a gang-header transform; gang stages take over ... */
	}

	return (zio);
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio = zio_create(pio, spa, txg, bp, data, size, done,
	    private, ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = compress;

	if (compress != ZIO_COMPRESS_OFF)
		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
	}

	return (zio);
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg,
    blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done,
    void *private, int priority, int flags)
{
	zio_t *zio = zio_create(pio, spa, txg, bp, data, size, done,
	    private, ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio->io_checksum = checksum;

	return (zio);
}
static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, uint64_t txg,
    blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done,
    void *private, int priority, int flags)
{
	zio_t *zio = zio_create(pio, spa, txg, bp, data, size, done,
	    private, ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;

	return (zio);
}
zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	/* Work off our copy of the bp so the caller can free it. */
	zio->io_bp = &zio->io_bp_copy;

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	return (zio);
}
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	return (zio);
}
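/*
 * Usage sketch (hypothetical caller, not original code): the ZIL
 * replay path would claim each log block in the pool's first txg so
 * nothing can allocate it twice.
 */
#ifdef ZIO_EXAMPLES
static int
zio_example_claim(spa_t *spa, blkptr_t *bp, uint64_t first_txg)
{
	return (zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)));
}
#endif	/* ZIO_EXAMPLES */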
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c],
			    cmd, NULL, NULL, priority, flags));
	}

	return (zio);
}
static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum)
{
	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done,
	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done,
	    private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the write buffer to embed the checksum.  Therefore, we
		 * must make a local copy in case the data is being written
		 * to multiple places.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);
	}

	return (zio);
}
/*
* Create a child I/O to do some work for us. It has no associated bp.
*/
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority, flags,
	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
int
zio_wait(zio_t *zio)
{
	int error;

	zio->io_waiter = curthread;

	zio_next_stage_async(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	kmem_free(zio, sizeof (zio_t));

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_next_stage_async(zio);
}
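/*
 * Usage sketch (hypothetical caller, not original code; the zio_read()
 * argument list follows the reconstruction above): zio_wait() blocks
 * until the I/O tree completes, while zio_nowait() fires and forgets,
 * relying on the done callback.
 */
#ifdef ZIO_EXAMPLES
static int
zio_example_sync_read(spa_t *spa, blkptr_t *bp, void *buf, uint64_t size)
{
	return (zio_wait(zio_read(NULL, spa, bp, buf, size, NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL)));
}
#endif	/* ZIO_EXAMPLES */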
/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static void
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	mutex_enter(&zio->io_lock);
	if (*countp == 0) {
		mutex_exit(&zio->io_lock);
		zio_next_stage(zio);
	} else {
		zio->io_stalled = stage;
		mutex_exit(&zio->io_lock);
	}
}

static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0)
		pio->io_error = zio->io_error;
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_next_stage_async(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_wait_children_ready(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready);
}

void
zio_wait_children_done(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone);
}

static void
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
		    &pio->io_children_notready);

	zio_next_stage(zio);
}
static void
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	blkptr_t *bp = zio->io_bp;
	char blkbuf[BP_SPRINTF_LEN];

	if (zio->io_error) {
		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
		    zio_type_name[zio->io_type],
		    vdev_description(zio->io_vd),
		    (u_longlong_t)zio->io_offset, zio, blkbuf, zio->io_error);
		if (zio->io_type == ZIO_TYPE_WRITE && zio->io_error == EIO)
			dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
			    "partial write",
			    zio_type_name[zio->io_type],
			    vdev_description(zio->io_vd),
			    (u_longlong_t)zio->io_offset, zio, blkbuf,
			    zio->io_error);
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL))
			panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
			    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
			    zio_type_name[zio->io_type],
			    vdev_description(zio->io_vd),
			    (u_longlong_t)zio->io_offset, zio, blkbuf,
			    zio->io_error);
	}

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
		    &pio->io_children_notdone);

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		kmem_free(zio, sizeof (zio_t));
	}
}
/*
* ==========================================================================
* Compression support
* ==========================================================================
*/
static void
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	void *cbuf = NULL;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, lsize,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg) {
			/* The size changed; start over with a fresh bp. */
			BP_ZERO(bp);
		}
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

static void
zio_read_decompress(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}
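/*
 * Worked example of the pass logic above (illustrative): with
 * zp_rewrite == 1 and zp_dontcompress == 4, a block rewritten in
 * spa_sync() pass 2 may still compress, and is written in place when
 * its compressed size matches the existing bp; from pass 5 onward
 * compression is disabled entirely, so block sizes stop changing and
 * spa_sync() can converge.
 */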
/*
* ==========================================================================
* Gang block support
* ==========================================================================
*/
static void
zio_gang_pipeline(zio_t *zio)
{
	/*
	 * By default, the pipeline assumes that we're dealing with a gang
	 * block.  If we're not, strip out any gang-specific stages.
	 */
	if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
		zio->io_pipeline &= ~ZIO_GANG_STAGES;

	zio_next_stage(zio);
}
static void
zio_gang_byteswap(zio_t *zio)
{
	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static void
zio_get_gang_header(zio_t *zio)
{
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	zio_push_transform(zio, gbuf, gsize, gsize);
	/* ... read the gang header into gbuf as a child I/O ... */
	zio_wait_children_done(zio);
}
static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh = zio->io_data;
	uint64_t loff, lsize;
	int i;

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		ASSERT(i < SPA_GBH_NBLKPTRS);
		lsize = BP_GET_PSIZE(&gbh->zg_blkptr[i]);
		/* ... zio_nowait() a child zio_read() into data + loff ... */
	}

	zio_wait_children_done(zio);
}

static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh = zio->io_data;
	uint64_t loff, lsize;
	int i;

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		ASSERT(i < SPA_GBH_NBLKPTRS);
		lsize = BP_GET_LSIZE(&gbh->zg_blkptr[i]);
		/* ... zio_nowait() a child zio_rewrite() from data + loff ... */
	}

	zio_wait_children_done(zio);
}
static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh = zio->io_data;
	int i;

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_wait_children_done(zio);
}

static void
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh = zio->io_data;
	int i;

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_wait_children_done(zio);
}
static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	/* XXBP - Need to be careful here with multiple DVAs */
	dva_t *cdva = ZIO_GET_DVA(zio);
	dva_t *pdva = ZIO_GET_DVA(pio);

	mutex_enter(&pio->io_lock);
	DVA_SET_ASIZE(pdva, DVA_GET_ASIZE(pdva) + DVA_GET_ASIZE(cdva));
	mutex_exit(&pio->io_lock);
}

static void
zio_write_allocate_gang_members(zio_t *zio)
{
	dva_t *dva = ZIO_GET_DVA(zio);
	zio_gbh_phys_t *gbh;
	uint64_t maxalloc = zio->io_size;
	uint64_t resid = zio->io_size;
	int error;
	int i;

	if (metaslab_alloc(zio->io_spa, SPA_GANGBLOCKSIZE, dva,
	    zio->io_txg) != 0)
		panic("can't allocate gang block header");

	DVA_SET_GANG(dva, 1);

	gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	bzero(gbh, SPA_GANGBLOCKSIZE);

	for (i = 0; resid != 0; i++) {
		/* Halve the member size until the metaslab can satisfy it. */
		for (;;) {
			maxalloc = MIN(maxalloc, resid);
			error = metaslab_alloc(zio->io_spa, maxalloc,
			    &gbh->zg_blkptr[i].blk_dva[0], zio->io_txg);
			if (error == 0)
				break;
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}
		if (maxalloc * (SPA_GBH_NBLKPTRS - i) >= resid) {
			/* ... child zio_rewrite() writes the member in place ... */
		} else {
			/* ... child zio_write_allocate() gangs it further ... */
		}
		resid -= maxalloc;
	}

	zio_wait_children_done(zio);
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static void
zio_dva_allocate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	int error;

	/* For testing, make some blocks above a certain size be gang blocks */
	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
		zio_write_allocate_gang_members(zio);
		return;
	}

	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC) {
		if (zio->io_size == SPA_MINBLOCKSIZE)
			panic("really, truly out of space");
		zio_write_allocate_gang_members(zio);
		return;
	} else {
		zio->io_error = error;
	}

	zio_next_stage(zio);
}
static void
zio_dva_free(zio_t *zio)
{
	/* ... metaslab_free() the bp's DVA, then BP_ZERO() the bp ... */

	zio_next_stage(zio);
}
static void
zio_dva_claim(zio_t *zio)
{
	zio->io_error = metaslab_claim(zio->io_spa, ZIO_GET_DVA(zio),
	    zio->io_txg);

	zio_next_stage(zio);
}
static void
zio_dva_translate(zio_t *zio)
{
	dva_t *dva = ZIO_GET_DVA(zio);

	zio->io_vd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(dva));
	zio->io_offset = DVA_GET_OFFSET(dva);

	zio_next_stage(zio);
}
/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
*/
static void
zio_vdev_io_enter(zio_t *zio)
{
	/* ... add zio to its top-level vdev's pending-I/O list ... */
}

static void
zio_vdev_io_exit(zio_t *zio)
{
	/* ... remove zio from the pending list; wake any reopen waiter ... */
}
static void
zio_vdev_io_retry(void *vdarg)
{
	vdev_t *vd = vdarg;
	zio_t *zio, *zq;

	/* XXPOLICY */
	delay(hz);

	vdev_reopen(vd, &zq);

	while ((zio = zq) != NULL) {
		zq = zio->io_retry_next;
		zio->io_retry_next = NULL;
		dprintf("async retry #%d for I/O to %s offset %llx\n",
		    zio->io_retries, vdev_description(vd), zio->io_offset);
		zio_next_stage_async(zio);
	}
}
static void
zio_vdev_io_setup(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/* XXPOLICY */
	if (zio->io_retries == 0 && vd == vd->vdev_top)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	zio_next_stage(zio);
}
static void
zio_vdev_io_start(zio_t *zio)
{
	vdev_io_start(zio);

	/* zio_next_stage_async() gets called from io completion interrupt */
}

static void
zio_vdev_io_done(zio_t *zio)
{
	vdev_io_done(zio);
}
/* XXPOLICY */
static boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_error == ECKSUM)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	zio_vdev_io_exit(zio);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		zio->io_retries++;
		zio->io_error = 0;

		/* XXPOLICY */
		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		/*
		 * If this is the first retry, do it immediately.
		 */
		/* XXPOLICY */
		if (zio->io_retries == 1) {
			zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
			zio_next_stage_async(zio);
			return;
		}

		/*
		 * This was not the first retry, so go through the
		 * longer enqueue/delay/vdev_reopen() process.
		 */
		/* ... enqueue zio on tvd's retry queue ... */
		(void) taskq_dispatch(
		    tvd->vdev_spa->spa_vdev_retry_taskq,
		    zio_vdev_io_retry, tvd, TQ_SLEEP);
		return;
	}

	zio_next_stage(zio);
}
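/*
 * Retry flow in brief (summary of the policy above): the first retry
 * re-enters the vdev pipeline immediately, while subsequent retries
 * are queued to a taskq that delay()s and vdev_reopen()s the top-level
 * vdev before re-issuing, giving a flaky device time to recover.
 */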
void
zio_vdev_io_reissue(zio_t *zio)
{
	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static void
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	zio_checksum(zio->io_checksum, &bp->blk_cksum,
	    zio->io_data, zio->io_size);

	zio_next_stage(zio);
}
static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;

	zio_set_gang_verifier(zio, &zc);
	/* ... checksum the gang header seeded with the verifier ... */

	zio_next_stage(zio);
}
static void
zio_checksum_verify(zio_t *zio)
{
	if (zio_checksum_error(zio) != 0) {
		zio->io_error = ECKSUM;
		if (zio->io_retries == 0) {
			dprintf("bad checksum on vdev %s\n",
			    vdev_description(zio->io_vd));
		}
	}

	zio_next_stage(zio);
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}
/*
* Set the external verifier for a gang block based on stuff in the bp
*/
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
	zcp->zc_word[2] = zio->io_bp->blk_birth;
	zcp->zc_word[3] = 0;
}
/*
* ==========================================================================
* Define the pipeline
* ==========================================================================
*/
typedef void zio_pipe_stage_t(zio_t *zio);

static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	/* ... one entry per ZIO_STAGE_* value, in pipeline order ... */
	zio_done,
	zio_badop
};
/*
* Move an I/O to the next stage of the pipeline and execute that stage.
* There's no locking on io_stage because there's no legitimate way for
* multiple threads to be attempting to process the same I/O.
*/
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	/* Advance past stages that are not in this zio's pipeline. */
	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	zio_pipeline[zio->io_stage](zio);
}
void
zio_next_stage_async(zio_t *zio)
{
	taskq_t *tq;
	uint32_t pipeline = zio->io_pipeline;

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	/*
	 * For performance, we'll probably want two sets of task queues:
	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
	 * part is for read performance: since we have to make a pass over
	 * the data to checksum it anyway, we want to do this on the same CPU
	 * that issued the read, because (assuming CPU scheduling affinity)
	 * that thread is probably still there.  Getting this optimization
	 * right avoids performance-hostile cache-to-cache transfers.
	 *
	 * Note that having two sets of task queues is also necessary for
	 * correctness: if all of the issue threads get bogged down waiting
	 * for dependent reads (e.g. metaslab freelist) to complete, then
	 * there won't be any threads available to service I/O completion
	 * interrupts.
	 */
	if ((1U << zio->io_stage) & zio->io_async_stages) {
		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
		else
			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
		(void) taskq_dispatch(tq,
		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
	} else {
		zio_pipeline[zio->io_stage](zio);
	}
}
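/*
 * Illustrative reading of the dispatch above: async stages numbered
 * below ZIO_STAGE_VDEV_IO_DONE run from an issue taskq, and later
 * stages, reached from the device completion interrupt, run from an
 * intr taskq, so issue threads blocked on dependent reads can never
 * starve completion processing.
 */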
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER);

	BP_ZERO(bp);

	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);

	if (error == 0) {
		BP_SET_CHECKSUM(bp, checksum);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
		BP_SET_LEVEL(bp, 0);
		bp->blk_birth = txg;
	}

	spa_config_exit(spa);

	return (error);
}
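/*
 * Usage sketch (hypothetical caller, not original code; the argument
 * list follows the reconstruction above): a log writer would try to
 * allocate a fresh intent log block and handle ENOSPC itself.
 */
#ifdef ZIO_EXAMPLES
static int
zio_example_log_alloc(spa_t *spa, int checksum, uint64_t size,
    blkptr_t *bp, uint64_t txg)
{
	int error = zio_alloc_blk(spa, checksum, size, bp, txg);

	if (error == ENOSPC) {
		/* ... caller falls back, e.g. waits for the txg to sync ... */
	}
	return (error);
}
#endif	/* ZIO_EXAMPLES */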
/*
* Free an intent log block. We know it can't be a gang block, so there's
* nothing to do except metaslab_free() it.
*/
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	spa_config_enter(spa, RW_READER);

	metaslab_free(spa, BP_GET_PSIZE(bp), BP_IDENTITY(bp), txg);

	spa_config_exit(spa);
}