ztest.c revision 095bcd6622e3b3520eb3b71039a3be5cfab25b74
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* The objective of this program is to provide a DMU/ZAP/SPA stress test
* that runs entirely in userland, is easy to use, and easy to extend.
*
* The overall design of the ztest program is as follows:
*
* (1) For each major functional area (e.g. adding vdevs to a pool,
* creating and destroying datasets, reading and writing objects, etc)
* we have a simple routine to test that functionality. These
* individual routines do not have to do anything "stressful".
*
* (2) We turn these simple functionality tests into a stress test by
* running them all in parallel, with as many threads as desired,
* and spread across as many datasets, objects, and vdevs as desired.
*
* (3) While all this is happening, we inject faults into the pool to
* verify that self-healing data really works.
*
* (4) Every time we open a dataset, we change its checksum and compression
* functions. Thus even individual objects vary from block to block
* in which checksum they use and whether they're compressed.
*
* (5) To verify that we never lose on-disk consistency after a crash,
* we run the entire test in a child of the main process.
* At random times, the child self-immolates with a SIGKILL.
* This is the software equivalent of pulling the power cord.
* The parent then runs the test again, using the existing
* storage pool, as many times as desired.
*
* (6) To verify that we don't have future leaks or temporal incursions,
* many of the functional tests record the transaction group number
* as part of their data. When reading old data, they verify that
* the transaction group number is less than the current, open txg.
* If you add a new test, please do this if applicable.
*
* When run with no arguments, ztest runs for about five minutes and
* produces no output if successful. To get a little bit of information,
* specify -V. To get more information, specify -VV, and so on.
*
* To turn this into an overnight stress test, use -T to specify run time.
*
* You can ask for more vdevs [-v], datasets [-d], or threads [-t]
* to increase the pool capacity, fanout, and overall stress level.
*
* The -N(okill) option will suppress kills, so each child runs to completion.
* This can be useful when you're trying to distinguish temporal incursions
* from plain old race conditions.
*/
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/resource.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
/*
 * Program name and run-time tunables.  Where an option letter is given,
 * it is the command-line flag whose default the usage text prints from
 * that variable.
 */
static char cmdname[] = "ztest";
static uint64_t zopt_vdevtime;
/* -a: alignment shift; the usage text says 0 means "random" */
static int zopt_ashift = SPA_MINBLOCKSHIFT;
/* -m: number of mirror copies */
static int zopt_mirrors = 2;
/* -r: number of raidz disks */
static int zopt_raidz = 4;
/* -R: raidz parity */
static int zopt_raidz_parity = 1;
/* -d: number of datasets */
static int zopt_datasets = 7;
/* -t: number of threads */
static int zopt_threads = 23;
/* -V: verbosity level; incremented once for each -V given */
static int zopt_verbose = 0;
/* -i: number of times to initialize the pool; -E (use existing) sets it to 0 */
static int zopt_init = 1;
/* -f: directory in which the vdev files live */
static char *zopt_dir = "/tmp";
static int zopt_maxfaults;
typedef struct ztest_block_tag {
/*
 * Argument block: the parameter type taken by ztest_func_t routines.
 */
typedef struct ztest_args {
/* NOTE(review): presumably the name of the pool under test -- confirm */
char za_pool[MAXNAMELEN];
/*
* Thread-local variables can go here to aid debugging.
*/
} ztest_args_t;
typedef void ztest_func_t(ztest_args_t *);
/*
* Note: these aren't static because we want dladdr() to work.
*/
typedef struct ztest_info {
} ztest_info_t;
ztest_info_t ztest_info[] = {
};
#define ZTEST_SYNC_LOCKS 16
/*
* The following struct is used to hold a list of uncalled commit callbacks.
*
* The callbacks are ordered by txg number.
*/
typedef struct ztest_cb_list {
/*
* Stuff we need to share writably between parent and child.
*/
typedef struct ztest_shared {
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;
static int ztest_random_fd;
static int ztest_dump_core = 1;
static uint64_t metaslab_sz;
static boolean_t ztest_exiting;
/* Global commit callback list */
static ztest_cb_list_t zcl;
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
#define ZTEST_DIROBJ 1
#define ZTEST_MICROZAP_OBJ 2
#define ZTEST_FATZAP_OBJ 3
#define ZTEST_DIRSIZE 256
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
*/
const char *
{
return ("default,verbose"); /* $UMEM_DEBUG setting */
}
/*
 * libumem hook: hands back the default string to use as the
 * $UMEM_LOGGING setting.  The returned string has static storage
 * duration and must not be modified or freed by the caller.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging_setting[] = "fail,contents";

	return (umem_logging_setting);	/* $UMEM_LOGGING setting */
}
#define FATAL_MSG_SZ 1024
char *fatal_msg;
static void
{
int save_errno = errno;
char buf[FATAL_MSG_SZ];
/* LINTED */
if (do_perror) {
}
if (ztest_dump_core)
abort();
exit(3);
}
static int
{
const char *ends = "BKMGTPEZ";
int i;
if (buf[0] == '\0')
return (0);
break;
}
buf);
}
return (10*i);
}
/* NOTREACHED */
}
static uint64_t
nicenumtoull(const char *buf)
{
char *end;
} else if (end[0] == '.') {
if (fval > UINT64_MAX) {
buf);
}
} else {
buf);
}
}
return (val);
}
static void
{
char nice_vdev_size[10];
char nice_gang_bang[10];
"\t[-v vdevs (default: %llu)]\n"
"\t[-s size_of_each_vdev (default: %s)]\n"
"\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
"\t[-m mirror_copies (default: %d)]\n"
"\t[-r raidz_disks (default: %d)]\n"
"\t[-R raidz_parity (default: %d)]\n"
"\t[-d datasets (default: %d)]\n"
"\t[-t threads (default: %d)]\n"
"\t[-g gang_block_threshold (default: %s)]\n"
"\t[-i initialize pool i times (default: %d)]\n"
"\t[-k kill percentage (default: %llu%%)]\n"
"\t[-p pool_name (default: %s)]\n"
"\t[-f file directory for vdev files (default: %s)]\n"
"\t[-V(erbose)] (use multiple times for ever more blather)\n"
"\t[-E(xisting)] (use existing pool instead of creating new one)\n"
"\t[-T time] total run time (default: %llu sec)\n"
"\t[-P passtime] time per pass (default: %llu sec)\n"
"\t[-h] (print help)\n"
"",
nice_vdev_size, /* -s */
zopt_ashift, /* -a */
zopt_mirrors, /* -m */
zopt_raidz, /* -r */
zopt_raidz_parity, /* -R */
zopt_datasets, /* -d */
zopt_threads, /* -t */
nice_gang_bang, /* -g */
zopt_init, /* -i */
zopt_pool, /* -p */
zopt_dir, /* -f */
}
static uint64_t
{
uint64_t r;
if (range == 0)
return (0);
if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
return (r % range);
}
/*
 * Hook that tests call with a short string identifying the operation
 * involved (e.g. "spa_vdev_add", "dmu_objset_create").
 *
 * NOTE(review): the body is empty here, so 's' is unused (hence the
 * ARGSUSED marker).  Presumably this is meant to record that an
 * out-of-space condition was seen -- confirm.
 */
/* ARGSUSED */
static void
ztest_record_enospc(char *s)
{
}
static void
{
int opt;
/* By default, test gang blocks for blocks 32K and greater */
"v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
value = 0;
switch (opt) {
case 'v':
case 's':
case 'a':
case 'm':
case 'r':
case 'R':
case 'd':
case 't':
case 'g':
case 'i':
case 'k':
case 'T':
case 'P':
}
switch (opt) {
case 'v':
zopt_vdevs = value;
break;
case 's':
break;
case 'a':
zopt_ashift = value;
break;
case 'm':
break;
case 'r':
break;
case 'R':
break;
case 'd':
break;
case 't':
break;
case 'g':
break;
case 'i':
break;
case 'k':
break;
case 'p':
break;
case 'f':
break;
case 'V':
zopt_verbose++;
break;
case 'E':
zopt_init = 0;
break;
case 'T':
break;
case 'P':
break;
case 'h':
break;
case '?':
default:
break;
}
}
}
/*
 * Return the ashift to use, derived from the zopt_ashift (-a) option.
 *
 * NOTE(review): the usage text for -a says "use 0 for random", but as
 * written the zopt_ashift == 0 branch returns zopt_ashift itself (i.e. 0),
 * and the non-zero path reaches the end of this non-void function without
 * a return statement.  It looks like the random-selection return guarded
 * by the 'if' has been lost -- confirm against the upstream file.
 */
static uint64_t
ztest_get_ashift(void)
{
if (zopt_ashift == 0)
return (zopt_ashift);
}
static nvlist_t *
{
char pathbuf[MAXPATHLEN];
if (ashift == 0)
ashift = ztest_get_ashift();
} else {
}
}
if (size != 0) {
if (fd == -1)
}
return (file);
}
static nvlist_t *
{
int c;
if (r < 2)
for (c = 0; c < r; c++)
VDEV_TYPE_RAIDZ) == 0);
zopt_raidz_parity) == 0);
child, r) == 0);
for (c = 0; c < r; c++)
nvlist_free(child[c]);
return (raidz);
}
static nvlist_t *
int r, int m)
{
int c;
if (m < 1)
for (c = 0; c < m; c++)
VDEV_TYPE_MIRROR) == 0);
child, m) == 0);
for (c = 0; c < m; c++)
nvlist_free(child[c]);
return (mirror);
}
static nvlist_t *
int log, int r, int m, int t)
{
int c;
ASSERT(t > 0);
for (c = 0; c < t; c++) {
log) == 0);
}
child, t) == 0);
for (c = 0; c < t; c++)
nvlist_free(child[c]);
return (root);
}
static void
{
int bs = SPA_MINBLOCKSHIFT +
int ibs = DN_MIN_INDBLKSHIFT +
int error;
if (error) {
char osname[300];
fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
}
}
static uint8_t
ztest_random_checksum(void)
{
do {
if (checksum == ZIO_CHECKSUM_OFF)
return (checksum);
}
/*
 * Presumably selects a compression function at random (by analogy with
 * ztest_random_checksum()); callers in this file pass its return value
 * as an argument alongside a tx.
 *
 * NOTE(review): the body is empty, so this non-void function returns no
 * value -- the selection logic appears to be missing; confirm.
 */
static uint8_t
ztest_random_compress(void)
{
}
static int
{
int error;
if (byteswap)
if (error) {
return (error);
}
DMU_OT_NONE, 0, tx);
if (zopt_verbose >= 5) {
char osname[MAXNAMELEN];
(void) printf("replay create of %s object %llu"
" in txg %llu = %d\n",
}
return (error);
}
static int
{
int error;
if (byteswap)
if (error) {
return (error);
}
return (error);
}
NULL, /* 0 no such transaction type */
ztest_replay_create, /* TX_CREATE */
NULL, /* TX_MKDIR */
NULL, /* TX_MKXATTR */
NULL, /* TX_SYMLINK */
ztest_replay_remove, /* TX_REMOVE */
NULL, /* TX_RMDIR */
NULL, /* TX_LINK */
NULL, /* TX_RENAME */
NULL, /* TX_WRITE */
NULL, /* TX_TRUNCATE */
NULL, /* TX_SETATTR */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
NULL, /* TX_CREATE_ACL_ATTR */
NULL, /* TX_MKDIR_ACL */
NULL, /* TX_MKDIR_ATTR */
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
};
/*
* Verify that we can't destroy an active pool, create an existing pool,
* or create a pool with a bad vdev spec.
*/
void
{
int error;
/*
* Attempt to create using a bad file.
*/
/*
* Attempt to create using a bad mirror.
*/
/*
* Attempt to create an existing pool. It shouldn't matter
* what's in the nvroot; we should fail with EEXIST.
*/
if (error)
}
static vdev_t *
{
return (vd);
for (int c = 0; c < vd->vdev_children; c++)
NULL)
return (mvd);
return (NULL);
}
/*
* Find the first available hole which can be used as a top-level.
*/
int
{
int c;
for (c = 0; c < rvd->vdev_children; c++) {
if (cvd->vdev_ishole)
break;
}
return (c);
}
/*
* Verify that vdev_add() works as expected.
*/
void
{
int error;
/*
* If we have slogs then remove them 1/4 of the time.
*/
/*
* Grab the guid from the head of the log class rotor.
*/
/*
* We have to grab the zs_name_lock as writer to
* prevent a race between removing a slog (dmu_objset_find)
* and destroying a dataset. Removing the slog will
* grab a reference on the dataset which may cause
* dmu_objset_destroy() to fail with EBUSY thus
* leaving the dataset in an inconsistent state.
*/
} else {
/*
* Make 1/4 of the devices be log devices.
*/
ztest_record_enospc("spa_vdev_add");
else if (error != 0)
}
}
/*
* Verify that adding and removing aux devices works as expected.
*/
void
{
char *aux;
int error;
if (ztest_random(2) == 0) {
} else {
}
/*
* Pick a random device to remove.
*/
} else {
/*
* Find an unused device we can add.
*/
ztest_shared->zs_vdev_aux = 0;
for (;;) {
char path[MAXPATHLEN];
int c;
path) == 0)
break;
break;
}
}
if (guid == 0) {
/*
* Add a new device.
*/
if (error != 0)
} else {
/*
* Remove an existing device. Sometimes, dirty its
* vdev state first to make sure we handle removal
* of devices that have pending state changes.
*/
if (ztest_random(2) == 0)
}
}
/*
* Verify that we can attach and detach devices.
*/
void
{
int replacing;
int oldvd_has_siblings = B_FALSE;
int newvd_is_spare = B_FALSE;
int oldvd_is_log;
int error, expected_error;
/*
* Decide whether to do an attach or a replace.
*/
/*
* Pick a random top-level vdev.
*/
/*
* Pick a random leaf within it.
*/
/*
* Locate this vdev.
*/
if (zopt_mirrors >= 1) {
}
if (zopt_raidz > 1) {
}
/*
* If we're already doing an attach or replace, oldvd may be a
* mirror vdev -- in which case, pick a random child.
*/
while (oldvd->vdev_children != 0) {
}
/*
* If oldvd has siblings, then half of the time, detach it.
*/
return;
}
/*
* For the new vdev, choose with equal probability between the two
* standard paths (ending in either 'a' or 'b') or a random hot spare.
*/
} else {
if (ztest_random(2) == 0)
}
if (newvd) {
} else {
/*
* Make newsize a little bigger or smaller than oldsize.
* If it's smaller, the attach should fail.
* If it's larger, and we're doing a replace,
* we should get dynamic LUN growth when we're done.
*/
}
/*
* If pvd is not a mirror or root, the attach should fail with ENOTSUP,
* unless it's a replace; in that case any non-replacing parent is OK.
*
* If newvd is already part of the pool, it should fail with EBUSY.
*
* If newvd is too small, it should fail with EOVERFLOW.
*/
else
expected_error = 0;
/*
* Build the nvlist describing newpath.
*/
ashift, 0, 0, 0, 1);
/*
* If our parent was the replacing vdev, but the replace completed,
* then instead of failing with ENOTSUP we may either succeed,
* fail with ENODEV, or fail with EOVERFLOW.
*/
if (expected_error == ENOTSUP &&
/*
* If someone grew the LUN, the replacement may be too small.
*/
/* XXX workaround 6690467 */
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",
}
}
/*
* Callback function which expands the physical size of the vdev.
*/
vdev_t *
{
int fd;
return (vd);
if (zopt_verbose >= 6) {
(void) printf("%s grew from %lu to %lu bytes\n",
}
return (NULL);
}
/*
* Callback function which expands a given vdev by calling vdev_online().
*/
/* ARGSUSED */
vdev_t *
{
int error;
/* Calling vdev_online will initialize the new metaslabs */
/*
* If vdev_online returned an error or the underlying vdev_open
* failed then we abort the expand. The only way to know that
* vdev_open fails is by checking the returned newstate.
*/
if (zopt_verbose >= 5) {
(void) printf("Unable to expand vdev, state %llu, "
}
return (vd);
}
/*
* Since we dropped the lock we need to ensure that we're
* still talking to the original vdev. It's possible this
* vdev may have been detached/replaced while we were
* trying to online it.
*/
if (zopt_verbose >= 5) {
(void) printf("vdev configuration has changed, "
"guid %llu, state %llu, expected gen %llu, "
}
return (vd);
}
return (NULL);
}
/*
* Traverse the vdev tree calling the supplied function.
* We continue to walk the tree until we either have walked all
* children or we receive a non-NULL return from the callback.
* If a NULL callback is passed, then we just return back the first
* leaf vdev we encounter.
*/
vdev_t *
{
return (vd);
else
}
return (cvd);
}
return (NULL);
}
/*
* Verify that dynamic LUN growth works as expected.
*/
void
{
}
/*
* Determine the size of the first leaf vdev associated with
* our top-level device.
*/
/*
* We only try to expand the vdev if it's healthy, less than 4x its
* original size, and it has a valid psize.
*/
return;
}
if (zopt_verbose >= 5) {
(void) printf("Expanding vdev %s from %lu to %lu\n",
}
/*
* Growing the vdev is a two step process:
* 1). expand the physical size (i.e. relabel)
* 2). online the vdev to create the new metaslabs
*/
if (zopt_verbose >= 5) {
(void) printf("Could not expand LUN because "
"the vdev configuration changed.\n");
}
return;
}
/*
* Expanding the LUN will update the config asynchronously,
* thus we must wait for the async thread to complete any
* pending tasks before proceeding.
*/
/*
* Make sure we were able to grow the pool.
*/
spa_cursize >= spa_newsize) {
(void) printf("Top-level vdev metaslab count: "
"before %llu, after %llu\n",
fatal(0, "LUN expansion failed: before %llu, "
} else if (zopt_verbose >= 5) {
(void) printf("%s grew from %s to %s\n",
}
}
/* ARGSUSED */
static void
{
/*
* Create the directory object.
*/
}
static int
{
int error;
/*
* Verify that the dataset contains a directory object.
*/
/* We could have crashed in the middle of destroying it */
}
/*
* Destroy the dataset.
*/
if (error) {
}
return (0);
}
/*
* Verify that dmu_objset_{create,destroy,open,close} work as expected.
*/
static uint64_t
{
char name[24];
}
void
{
int error;
char name[100];
/*
* If this dataset exists from a previous run, process its replay log
* half of the time. If we don't replay it, then dmu_objset_destroy()
* (invoked from ztest_destroy_cb() below) should just throw it away.
*/
if (ztest_random(2) == 0 &&
}
/*
* There may be an old instance of the dataset we're about to
* create lying around from a previous run. If so, destroy it
* and all of its snapshots.
*/
/*
* Verify that the destroyed dataset is no longer in the namespace.
*/
/*
* Verify that we can create a new dataset.
*/
if (error) {
ztest_record_enospc("dmu_objset_create");
return;
}
}
if (error) {
}
/*
* Open the intent log for it.
*/
/*
* Put a random number of objects in there.
*/
seq = 0;
while (objects-- != 0) {
if (error) {
} else {
DMU_OT_NONE, 0, tx);
}
if (ztest_random(5) == 0) {
}
if (ztest_random(100) == 0) {
if (error == 0) {
}
}
}
/*
* Verify that we cannot create an existing dataset.
*/
/*
* Verify that we can hold an objset that is also owned.
*/
if (error)
/*
* Verify that we can not own an objset that is already owned.
*/
fatal(0, "dmu_objset_open('%s') = %d, expected EBUSY",
if (error)
}
/*
* Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
*/
void
{
int error;
char snapname[100];
char osname[MAXNAMELEN];
ztest_record_enospc("dmu_take_snapshot");
}
/*
* Cleanup non-standard snapshots and clones.
*/
void
{
char snap1name[100];
char clone1name[100];
char snap2name[100];
char clone2name[100];
char snap3name[100];
int error;
}
/*
* Verify dsl_dataset_promote handles EBUSY
*/
void
{
int error;
char snap1name[100];
char clone1name[100];
char snap2name[100];
char clone2name[100];
char snap3name[100];
char osname[MAXNAMELEN];
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
}
if (error)
if (error) {
ztest_record_enospc("dmu_objset_create");
goto out;
}
}
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
}
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
}
if (error)
if (error) {
ztest_record_enospc("dmu_objset_create");
goto out;
}
}
if (error)
error);
out:
}
/*
* Verify that dmu_object_{alloc,free} work as expected.
*/
void
{
char osname[MAXNAMELEN];
endoff = -8ULL;
batchsize = 2;
/*
* Create a batch object if necessary, and record it in the directory.
*/
if (batchobj == 0) {
sizeof (uint64_t));
if (error) {
ztest_record_enospc("create a batch object");
return;
}
DMU_OT_NONE, 0, tx);
}
/*
* Destroy the previous batch of objects.
*/
for (b = 0; b < batchsize; b++) {
if (object == 0)
continue;
/*
* Read and validate contents.
* We expect the nth byte of the bonus buffer to be n.
*/
for (c = 0; c < bonuslen; c++) {
fatal(0,
"bad bonus: %s, obj %llu, off %d: %u != %u",
}
}
/*
* We expect the word at endoff to be our object number.
*/
fatal(0, "bad data in %s, got %llu, expected %llu",
}
/*
* Destroy old object and clear batch entry.
*/
if (error) {
ztest_record_enospc("free object");
return;
}
if (error) {
fatal(0, "dmu_object_free('%s', %llu) = %d",
}
object = 0;
ztest_random_checksum(), tx);
ztest_random_compress(), tx);
}
/*
* Before creating the new batch of objects, generate a bunch of churn.
*/
for (b = ztest_random(100); b > 0; b--) {
if (error) {
ztest_record_enospc("churn objects");
return;
}
DMU_OT_NONE, 0, tx);
if (error) {
fatal(0, "dmu_object_free('%s', %llu) = %d",
}
}
/*
* Create a new batch of objects with randomly chosen
* blocksizes and record them in the batch directory.
*/
for (b = 0; b < batchsize; b++) {
sizeof (uint64_t));
sizeof (uint64_t));
if (error) {
ztest_record_enospc("create batchobj");
return;
}
ztest_random_checksum(), tx);
ztest_random_compress(), tx);
/*
* Write to both the bonus buffer and the regular data.
*/
ASSERT3S(va_nblocks, >=, 0);
/*
* See comments above regarding the contents of
* the bonus buffer and the word at endoff.
*/
for (c = 0; c < bonuslen; c++)
/*
* Write to a large offset to increase indirection.
*/
}
}
/*
* Verify that dmu_{read,write} work as expected.
*/
typedef struct bufwad {
} bufwad_t;
typedef struct dmu_read_write_dir {
void
{
int free_percent = 5;
/*
* This test uses two objects, packobj and bigobj, that are always
* updated together (i.e. in the same tx) so that their contents are
* in sync and can be compared. Their contents relate to each other
* in a simple way: packobj is a dense array of 'bufwad' structures,
* while bigobj is a sparse array of the same bufwads. Specifically,
* for any index n, there are three bufwads that should be identical:
*
* packobj, at offset n * sizeof (bufwad_t)
* bigobj, at the head of the nth chunk
* bigobj, at the tail of the nth chunk
*
* The chunk size is arbitrary. It doesn't have to be a power of two,
* and it doesn't have any relation to the object blocksize.
* The only requirement is that it can hold at least two bufwads.
*
* Normally, we write the bufwad to each of these locations.
* However, free_percent of the time we instead write zeroes to
* packobj and perform a dmu_free_range() on bigobj. By comparing
* bigobj to packobj, we can verify that the DMU is correctly
* tracking which parts of an object are allocated and free,
* and that the contents of the allocated blocks are correct.
*/
/*
* Read the directory info. If it's the first time, set things up.
*/
if (error) {
ztest_record_enospc("create r/w directory");
return;
}
DMU_OT_NONE, 0, tx);
DMU_OT_NONE, 0, tx);
tx);
}
/*
* Prefetch a random chunk of the big object.
* Our aim here is to get some async reads in flight
* for blocks that we may free below; the DMU should
* handle this race correctly.
*/
/*
* Pick a random index and compute the offsets into packobj and bigobj.
*/
/*
* free_percent of the time, free a range of bigobj rather than
* overwriting it.
*/
/*
* Read the current contents of our objects.
*/
/*
* Get a tx for the mods to both packobj and bigobj.
*/
if (freeit)
else
if (error) {
ztest_record_enospc("dmu r/w range");
return;
}
/*
* For each index from n to n + s, verify that the existing bufwad
* in packobj matches the bufwads at the head and tail of the
* corresponding chunk in bigobj. Then update all three bufwads
* with the new values we want to write out.
*/
for (i = 0; i < s; i++) {
/* LINTED */
/* LINTED */
/* LINTED */
fatal(0, "future leak: got %llx, open txg is %llx",
fatal(0, "wrong index: got %llx, wanted %llx+%llx",
if (freeit) {
} else {
}
}
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
if (freeit) {
if (zopt_verbose >= 6) {
(void) printf("freeing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)txg);
}
} else {
if (zopt_verbose >= 6) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)txg);
}
}
/*
* Sanity check the stuff we just wrote.
*/
{
}
}
void
{
uint64_t i;
/*
* For each index from n to n + s, verify that the existing bufwad
* in packobj matches the bufwads at the head and tail of the
* corresponding chunk in bigobj. Then update all three bufwads
* with the new values we want to write out.
*/
for (i = 0; i < s; i++) {
/* LINTED */
/* LINTED */
/* LINTED */
fatal(0, "future leak: got %llx, open txg is %llx",
fatal(0, "wrong index: got %llx, wanted %llx+%llx",
}
}
void
{
uint64_t i;
int error;
/*
* This test uses two objects, packobj and bigobj, that are always
* updated together (i.e. in the same tx) so that their contents are
* in sync and can be compared. Their contents relate to each other
* in a simple way: packobj is a dense array of 'bufwad' structures,
* while bigobj is a sparse array of the same bufwads. Specifically,
* for any index n, there are three bufwads that should be identical:
*
* packobj, at offset n * sizeof (bufwad_t)
* bigobj, at the head of the nth chunk
* bigobj, at the tail of the nth chunk
*
* The chunk size is set equal to bigobj block size so that
* dmu_assign_arcbuf() can be tested for object updates.
*/
/*
* Read the directory info. If it's the first time, set things up.
*/
if (error) {
ztest_record_enospc("create r/w directory");
return;
}
DMU_OT_NONE, 0, tx);
DMU_OT_NONE, 0, tx);
tx);
} else {
}
/*
* Pick a random index and compute the offsets into packobj and bigobj.
*/
/*
* Iteration 0 test zcopy for DB_UNCACHED dbufs.
* Iteration 1 test zcopy to already referenced dbufs.
* Iteration 2 test zcopy to dirty dbuf in the same txg.
* Iteration 3 test zcopy to dbuf dirty in previous txg.
* Iteration 4 test zcopy when dbuf is no longer dirty.
* Iteration 5 test zcopy when it can't be done.
* Iteration 6 one more zcopy write.
*/
for (i = 0; i < 7; i++) {
uint64_t j;
/*
* In iteration 5 (i == 5) use arcbufs
* that don't match bigobj blksz to test
* dmu_assign_arcbuf() when it can't directly
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
bigbuf_arcbufs[j] =
} else {
bigbuf_arcbufs[2 * j] =
}
}
/*
* Get a tx for the mods to both packobj and bigobj.
*/
if (ztest_random(100) == 0) {
error = -1;
} else {
}
if (error) {
if (error != -1) {
ztest_record_enospc("dmu r/w range");
}
for (j = 0; j < s; j++) {
if (i != 5) {
} else {
bigbuf_arcbufs[2 * j]);
}
}
return;
}
/*
* 50% of the time don't read objects in the 1st iteration to
* test dmu_assign_arcbuf() for the case when there're no
* existing dbufs for the specified offsets.
*/
if (i != 0 || ztest_random(2) != 0) {
}
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
if (zopt_verbose >= 6) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)txg);
}
if (i != 5) {
} else {
}
if (i == 1) {
}
if (i != 5) {
bigbuf_arcbufs[j], tx);
} else {
}
if (i == 1) {
}
}
/*
* Sanity check the stuff we just wrote.
*/
{
}
if (i == 2) {
} else if (i == 3) {
}
}
}
void
{
/*
* Make sure that, if there is a write record in the bonus buffer
* of the ZTEST_DIROBJ, the txg for this record is <= the
* last synced txg of the pool.
*/
}
}
void
{
int b, error;
int bs = ZTEST_DIROBJ_BLOCKSIZE;
int do_free = 0;
char osname[MAXNAMELEN];
char iobuf[SPA_MAXBLOCKSIZE];
/*
* Have multiple threads write to large offsets in ZTEST_DIROBJ
* to verify that having multiple threads writing to the same object
* in parallel doesn't cause any trouble.
*/
if (ztest_random(4) == 0) {
/*
* Do the bonus buffer instead of a regular block.
* We need a lock to serialize resize vs. others,
* so we hash on the objset ID.
*/
off = -1ULL;
} else {
b = ztest_random(ZTEST_SYNC_LOCKS);
if (ztest_random(4) == 0) {
do_free = 1;
} else {
}
}
ztest_random(8) == 0) {
}
if (error) {
} else {
ztest_record_enospc("dmu write parallel");
}
}
return;
}
(void) mutex_lock(lp);
/*
* Occasionally, write an all-zero block to test the behavior
* of blocks that compress into holes.
*/
if (off == -1ULL) {
char *dboff;
}
if (ztest_random(10) == 0) {
}
} else if (do_free) {
} else {
}
(void) mutex_unlock(lp);
if (ztest_random(1000) == 0)
if (ztest_random(10000) == 0)
return;
if (ztest_random(2) != 0)
return;
/*
* dmu_sync() the block we just wrote.
*/
(void) mutex_lock(lp);
if (error) {
(void) mutex_unlock(lp);
return;
}
if (error) {
(void) mutex_unlock(lp);
return;
}
(void) mutex_unlock(lp);
return;
}
(void) mutex_unlock(lp);
/*
* Read the block that dmu_sync() returned to make sure its contents
* match what we wrote. We do this while still txg_suspend()ed
* to ensure that the block can't be reused before we read it.
*/
return;
return;
/*
* The semantic of dmu_sync() is that we always push the most recent
* version of the data, so in the face of concurrent updates we may
* see a newer version of the block. That's OK.
*/
else
}
/*
* Verify that zap_{create,destroy,add,remove,update} work as expected.
*/
#define ZTEST_ZAP_MIN_INTS 1
#define ZTEST_ZAP_MAX_INTS 4
#define ZTEST_ZAP_MAX_PROPS 1000
void
{
int i, ints;
int error;
char osname[MAXNAMELEN];
/*
* Create a new object if necessary, and record it in the directory.
*/
if (object == 0) {
sizeof (uint64_t));
if (error) {
ztest_record_enospc("create zap test obj");
return;
}
if (error) {
fatal(0, "zap_create('%s', %llu) = %d",
}
/*
* Generate a known hash collision, and verify that
* we can lookup and remove both entries.
*/
for (i = 0; i < 2; i++) {
value[i] = i;
}
for (i = 0; i < 2; i++) {
&zl_intsize, &zl_ints);
}
for (i = 0; i < 2; i++) {
}
}
last_txg = 0;
/*
* If these zap entries already exist, validate their contents.
*/
if (error == 0) {
&zl_ints) == 0);
for (i = 0; i < ints; i++) {
}
} else {
}
/*
* Atomically update two entries in our zap object.
* The first is named txg_%llu, and contains the txg
* in which the property was last updated. The second
* is named prop_%llu, and the nth element of its value
* should be txg + object + n.
*/
if (error) {
ztest_record_enospc("create zap entry");
return;
}
for (i = 0; i < ints; i++)
if (error)
fatal(0, "zap_update('%s', %llu, '%s') = %d",
if (error)
fatal(0, "zap_update('%s', %llu, '%s') = %d",
/*
* Remove a random pair of entries.
*/
return;
if (error) {
ztest_record_enospc("remove zap entry");
return;
}
if (error)
fatal(0, "zap_remove('%s', %llu, '%s') = %d",
if (error)
fatal(0, "zap_remove('%s', %llu, '%s') = %d",
/*
* Once in a while, destroy the object.
*/
if (ztest_random(1000) != 0)
return;
if (error) {
ztest_record_enospc("destroy zap object");
return;
}
if (error)
fatal(0, "zap_destroy('%s', %llu) = %d",
object = 0;
}
/*
* Testcase to test the upgrading of a microzap to fatzap.
*/
void
{
int i, error;
char osname[MAXNAMELEN];
char *name = "aaa";
char entname[MAXNAMELEN];
/*
* Create a new object if necessary, and record it in the directory.
*/
if (object == 0) {
sizeof (uint64_t));
if (error) {
ztest_record_enospc("create zap test obj");
return;
}
if (error) {
fatal(0, "zap_create('%s', %llu) = %d",
}
}
/*
* Add entries to this ZAP and make sure it spills over
* and gets upgraded to a fatzap. Also, since we are adding
* 2050 entries we should see ptrtbl growth and leaf-block
* split.
*/
for (i = 0; i < 2050; i++) {
value = i;
if (error) {
ztest_record_enospc("create zap entry");
return;
}
}
/*
* Once in a while, destroy the object.
*/
if (ztest_random(1000) != 0)
return;
if (error) {
ztest_record_enospc("destroy zap object");
return;
}
if (error)
fatal(0, "zap_destroy('%s', %llu) = %d",
object = 0;
}
void
{
void *data;
/*
* Generate a random name of the form 'xxx.....' where each
* x is a random printable character and the dots are dots.
* There are 94 such characters, and the name length goes from
* 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
*/
for (i = 0; i < 3; i++)
for (; i < namelen - 1; i++)
name[i] = '.';
name[i] = '\0';
if (ztest_random(2) == 0)
else
wc = 1;
} else {
wsize = 1;
data = string_value;
}
count = -1ULL;
/*
* Select an operation: length, lookup, add, update, remove.
*/
i = ztest_random(5);
if (i >= 2) {
if (error) {
ztest_record_enospc("zap parallel");
return;
}
} else {
txg = 0;
}
switch (i) {
case 0:
if (error == 0) {
} else {
}
break;
case 1:
if (error == 0) {
if (data == string_value &&
fatal(0, "name '%s' != val '%s' len %d",
} else {
}
break;
case 2:
break;
case 3:
break;
case 4:
break;
}
}
/*
* Commit callback data.
*/
typedef struct ztest_cb_data {
int zcd_expected_err;
/* This is the actual commit callback function */
static void
{
/*
* The private callback data should be destroyed here, but
* since we are going to check the zcd_called field after
* dmu_tx_abort(), we will destroy it there.
*/
return;
}
/* Was this callback added to the global callback list? */
goto out;
/* Remove our callback from the list */
out:
}
/* Allocate and initialize callback data structure */
static ztest_cb_data_t *
{
return (cb_data);
}
/*
* If a number of txgs equal to this threshold have been created after a commit
* callback has been registered but not called, then we assume there is an
* implementation bug.
*/
/*
* Commit callback test.
*
* NOTE(review): the function name, parameters and most of the calls are not
* visible in this view; the comments describe only the visible statements.
*/
void
{
int i, error;
/* Every once in a while, abort the transaction on purpose */
/* A 1-in-100 random draw sets error to -1, which selects the abort path. */
if (ztest_random(100) == 0)
error = -1;
if (!error)
if (error) {
/*
* It's not a strict requirement to call the registered
* callbacks from inside dmu_tx_abort(), but that's what
* is supposed to happen in the current implementation
* so we will check for that.
*/
for (i = 0; i < 2; i++) {
}
for (i = 0; i < 2; i++) {
}
/* The error/abort path ends the test here. */
return;
}
/*
* Read existing data to make sure there isn't a future leak.
*/
&old_txg, DMU_READ_PREFETCH));
/*
* Since commit callbacks don't have any ordering requirement and since
* it is theoretically possible for a commit callback to be called
* after an arbitrary amount of time has elapsed since its txg has been
* synced, it is difficult to reliably determine whether a commit
* callback hasn't been called due to high load or due to a flawed
* implementation.
*
* In practice, we will assume that if after a certain number of txgs a
* commit callback hasn't been called, then most likely there's an
* implementation bug.
*/
fatal(0, "Commit callback threshold exceeded, oldest txg: %"
}
/*
* Let's find the place to insert our callbacks.
*
* Even though the list is ordered by txg, it is possible for the
* insertion point to not be the end because our txg may already be
* quiescing at this point and other callbacks in the open txg
* (from other objsets) may have sneaked in.
*/
/* Add the 3 callbacks to the list */
for (i = 0; i < 3; i++) {
else
cb_data[i]);
}
}
/*
* Property test: iterates over two properties, "checksum" (i == 0) and
* "compression" (i == 1). A failure recorded under the label "dsl_prop_set"
* stops the loop.
*
* NOTE(review): the function name, parameters, the property set/get calls and
* the condition guarding the ENOSPC record are not visible in this view.
*/
void
{
int i, inherit;
char setpoint[MAXPATHLEN];
char osname[MAXNAMELEN];
int error;
for (i = 0; i < 2; i++) {
/* Select the property name for this pass. */
if (i == 0) {
prop = "checksum";
} else {
prop = "compression";
}
ztest_record_enospc("dsl_prop_set");
break;
}
if (i == 0)
else
/* At verbosity 6 and above, print the property, its value and dataset. */
if (zopt_verbose >= 6) {
(void) printf("%s %s = %s for '%s'\n",
}
}
}
/*
* Exercise deferred snapshot destruction and snapshot holds, using the two
* scenarios described in the comments inside the function body.
*
* NOTE(review): the function name, parameters and the dataset calls are not
* visible in this view.
*/
void
{
int error;
char snapname[100];
char fullname[100];
char clonename[100];
char tag[100];
char osname[MAXNAMELEN];
/*
* Clean up from any previous run.
*/
/*
* Create snapshot, clone it, mark snap for deferred destroy,
* destroy clone, verify snap was also destroyed.
*/
/* A snapshot failure is recorded as ENOSPC and the test is abandoned. */
if (error) {
ztest_record_enospc("dmu_objset_snapshot");
goto out;
}
}
if (error)
/* A clone failure is likewise recorded as ENOSPC and the test abandoned. */
if (error) {
ztest_record_enospc("dmu_objset_clone");
goto out;
}
}
if (error) {
fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
}
if (error)
/*
* Create snapshot, add temporary hold, verify that we can't
* destroy a held snapshot, mark for deferred destroy,
* release hold, verify snapshot was destroyed.
*/
if (error) {
ztest_record_enospc("dmu_objset_snapshot");
goto out;
}
}
if (error)
fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
}
if (error) {
fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
}
if (error)
/* Common exit point for the early "goto out" paths above. */
out:
}
/*
* Inject random faults into the on-disk data.
*
* NOTE(review): the function name, parameters, several declarations and most
* of the statements are not visible in this view; the comments describe only
* what is shown.
*/
void
{
int fd;
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
/* Starting value for the corruption loop counter at the end. */
int iters = 1000;
/* Local copy of the configured fault tolerance. */
int maxfaults = zopt_maxfaults;
/*
* We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
*/
if (ztest_random(2) == 0) {
/*
* Inject errors on a normal data device.
*/
/*
* Generate paths to the first leaf in this top-level vdev,
* and to the random leaf we selected. We'll induce transient
* faults on the first leaf (see below),
* and we'll write random garbage to the randomly chosen leaf.
*/
/*
* Make vd0 explicitly claim to be unreadable,
* or unwriteable, or reach behind its back
* and close the underlying fd. We can do this if
* maxfaults == 0 because we'll fail and reexecute,
* and we can do it if maxfaults >= 2 because we'll
* have enough redundancy. If maxfaults == 1, the
* combination of this with injection of random data
* corruption below exceeds the pool's fault tolerance.
*/
} else if (ztest_random(2) == 0) {
} else {
}
}
} else {
/*
* Inject errors on an l2cache device.
*/
return;
}
leaf = 0;
leaves = 1;
}
/*
* If we can tolerate two or more faults, or we're dealing
* with a log device (islog), the code below may take the device
* offline (ZFS_OFFLINE_TEMPORARY).
* NOTE(review): the rest of the original comment and the surrounding
* statements are not visible in this view -- confirm.
*/
ZFS_OFFLINE_TEMPORARY : 0);
/*
* We have to grab the zs_name_lock as writer to
* prevent a race between offlining a slog and
* destroying a dataset. Offlining the slog will
* grab a reference on the dataset which may cause
* dmu_objset_destroy() to fail with EBUSY thus
* leaving the dataset in an inconsistent state.
*/
if (islog)
if (islog)
} else {
}
}
/* With no fault tolerance, skip the data corruption step. */
if (maxfaults == 0)
return;
/*
* We have at least single-fault tolerance, so inject data corruption.
*/
return;
/* iters starts at 1000, so this loop body runs at most 999 times. */
while (--iters != 0) {
continue;
if (zopt_verbose >= 6)
(void) printf("injecting bad word into %s,"
}
}
/*
* Scrub the pool.
*
* NOTE(review): the function name, parameters and body statements are not
* visible in this view.
*/
void
{
}
/*
* Rename the pool to a different name and then rename it back.
*
* Each step below is followed by an error check.
* NOTE(review): the function name, parameters, the calls being checked and
* the actions taken on error are not visible in this view.
*/
void
{
int error;
/*
* Do the rename
*/
if (error)
/*
* Try to open it under the old name, which shouldn't exist
*/
/*
* Open it under the new name and make sure it's still the same spa_t.
*/
if (error != 0)
/*
* Rename it back to the original
*/
if (error)
/*
* Make sure it can still be opened
*/
if (error != 0)
}
/*
* Completely obliterate one disk.
*
* Does nothing unless the configured fault tolerance (zopt_maxfaults) is at
* least 2.
* NOTE(review): the function name, parameters and the file operations are not
* visible in this view.
*/
static void
{
int fd;
/* Without at least double-fault tolerance, leave the disks alone. */
if (zopt_maxfaults < 2)
return;
/* fd == -1 signals that obtaining the descriptor failed. */
if (fd == -1)
/*
* Determine the size.
*/
/*
* Rename the old device to dev_name.old (useful for debugging).
*/
/*
* Create a new one.
*/
}
/*
* Builds an nvlist describing the device named in dev_name and then acts on
* it; the result is checked via error.
*
* NOTE(review): the function name, parameters, the nvlist construction, the
* call whose result is checked and the full error condition are not visible
* in this view.
*/
static void
{
char dev_name[MAXPATHLEN];
int error;
/*
* Build the nvlist describing dev_name.
*/
guid = 0;
else
if (error != 0 &&
}
/*
* Verify the blocks of the given pool.
*
* pool: name of the pool to check.
*
* A status of 0 ends the function immediately; otherwise ztest_dump_core is
* cleared before the remaining handling.
* NOTE(review): how the command in zbuf is built and run, and what happens in
* the final if/else, is not visible in this view.
*/
static void
ztest_verify_blocks(char *pool)
{
int status;
char zbuf[1024];
char *bin;
char *ztest;
char *isa;
int isalen;
/* LINTED */
isa,
pool);
if (zopt_verbose >= 5)
if (zopt_verbose >= 3)
/* Status 0: nothing further to do. */
if (status == 0)
return;
ztest_dump_core = 0;
else
}
/*
* Walk the pool directory.
*
* header: caption supplied by the caller (callers in this file pass strings
* such as "pools before export").
*
* Output is only considered at verbosity 6 and above.
* NOTE(review): the walk itself and the print statements are not visible in
* this view.
*/
static void
ztest_walk_pool_directory(char *header)
{
if (zopt_verbose >= 6)
if (zopt_verbose >= 6)
}
/*
* Export the pool and import it again under a new name, checking the
* expected failures along the way (see the step comments below).
*
* NOTE(review): the function name, parameters and the export/import calls
* themselves are not visible in this view.
*/
static void
{
int error;
if (zopt_verbose >= 4) {
}
/*
* Clean up from previous runs.
*/
(void) spa_destroy(newname);
/*
* Get the pool's configuration and guid.
*/
if (error)
/*
* NOTE(review): the text of this comment and the statement controlled by
* the coin flip below are not visible in this view.
*/
if (ztest_random(2) == 0)
ztest_walk_pool_directory("pools before export");
/*
* Export it.
*/
if (error)
ztest_walk_pool_directory("pools after export");
/*
* Try to import it.
*/
/*
* Import it under the new name.
*/
if (error)
ztest_walk_pool_directory("pools after import");
/*
* Try to import it again -- should fail with EEXIST.
*/
/*
* Try to import it under a different name -- should fail with EEXIST.
*/
/*
* Verify that the pool is no longer visible under the old name.
*/
/*
* Verify that we can open and close the pool using the new name.
*/
if (error)
}
/*
* If the pool is currently suspended, resume its I/O via zio_resume().
* The return value of zio_resume() is deliberately ignored.
*
* NOTE(review): the function name and parameter list are not visible in this
* view; spa is presumably the parameter.
*/
static void
{
if (spa_suspended(spa)) {
(void) zio_resume(spa);
}
}
/*
* Thread entry point that loops until ztest_exiting becomes set, then
* returns NULL.
*
* arg: opaque thread argument.
*
* NOTE(review): the loop body is not visible in this view.
*/
static void *
ztest_resume_thread(void *arg)
{
while (!ztest_exiting) {
}
return (NULL);
}
/*
* Worker thread entry point: repeatedly picks a random test function and
* decides whether to call it; returns NULL when done.
*
* arg: opaque thread argument.
*
* NOTE(review): the enclosing loop header, the crash check, the actual call
* and the ENOSPC condition are not visible in this view.
*/
static void *
ztest_thread(void *arg)
{
int f, i;
/*
* See if it's time to force a crash.
*/
}
/*
* Pick a random function.
*/
f = ztest_random(ZTEST_FUNCS);
/*
* Decide whether to call it, based on the requested frequency.
*/
/* A call target of 0 is one of the conditions for skipping this function. */
if (zi->zi_call_target == 0 ||
continue;
if (zopt_verbose >= 4) {
(void) printf("%6.2f sec in %s\n",
}
/*
* If we're getting ENOSPC with some regularity, stop.
*/
break;
}
return (NULL);
}
/*
* Kick off threads to run tests on all datasets in parallel.
*
* NOTE(review): the function name, parameters and most of the calls are not
* visible in this view; the comments describe only the visible statements.
*/
static void
{
int t, d, error;
char name[100];
NULL);
for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
/*
* Destroy one disk before we even start.
* It's mirrored, so everything should work just fine.
* This makes us exercise fault handling very early in spa_load().
*/
/*
* Verify that the sum of the sizes of all blocks in the pool
* equals the SPA's allocated space total.
*/
/*
* Kick off a replacement of the disk we just obliterated.
*/
if (zopt_verbose >= 5)
kernel_fini();
/*
* Verify that we can export the pool and reimport it under a
* different name.
*/
if (ztest_random(2) == 0) {
}
/*
* Verify that we can loop over all pools.
*/
if (zopt_verbose > 3) {
}
}
/*
* Open our pool.
*/
/*
* We don't expect the pool to suspend unless maxfaults == 0,
* in which case ztest_fault_inject() temporarily takes away
* the only valid replica.
*/
if (zopt_maxfaults == 0)
else
/*
* Create a thread to periodically resume suspended I/O.
*/
&resume_tid) == 0);
/*
* Verify that we can safely inquire about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
*/
/*
* Note: d runs from -5 to 5 inclusive, so each of the 64 powers of two
* is probed at eleven offsets.
*/
for (t = 0; t < 64; t++) {
for (d = -5; d <= 5; d++) {
(1ULL << t) + d, NULL);
}
}
/*
* Now kick off all the tests that run in parallel.
*/
zs->zs_enospc_count = 0;
if (zopt_verbose >= 4)
(void) printf("starting main threads...\n");
for (t = 0; t < zopt_threads; t++) {
/* Threads are spread over the datasets round-robin. */
d = t % zopt_datasets;
za[t].za_instance = t;
/* Only the first zopt_datasets threads take this setup branch. */
if (t < zopt_datasets) {
int test_future = FALSE;
test_future = TRUE;
zs->zs_enospc_count++;
break;
} else if (error != 0) {
fatal(0, "dmu_objset_create(%s) = %d",
}
if (error)
fatal(0, "dmu_objset_open('%s') = %d",
if (test_future)
}
}
/* Walk the started threads in reverse order. */
while (--t >= 0) {
if (t < zopt_datasets) {
}
}
if (zopt_verbose >= 3)
/*
* If we had out-of-space errors, destroy a random objset.
*/
if (zs->zs_enospc_count != 0) {
d = (int)ztest_random(zopt_datasets);
if (zopt_verbose >= 3)
/* Cleanup any non-standard clones and snapshots */
}
/* Kill the resume thread */
/*
* Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete.
*/
/* t runs from 1 to 49 here. */
for (t = 1; t < 50; t++)
kernel_fini();
}
/*
* Split the duration s into days (d), hours (h), minutes (m) and a remainder
* (s), then format it into timebuf using the largest non-zero unit as the
* leading field.
*
* NOTE(review): the function name and parameter list are not visible in this
* view; s and timebuf are presumably the parameters, and s is presumably in
* seconds given the 60/60/24 divisors -- confirm. Only the format string of
* the days case is visible.
*/
void
{
hrtime_t m = s / 60;
hrtime_t h = m / 60;
hrtime_t d = h / 24;
/* Reduce each unit to its remainder within the next larger unit. */
s -= m * 60;
m -= h * 60;
h -= d * 24;
/* Start from an empty string. */
timebuf[0] = '\0';
if (d)
"%llud%02lluh%02llum%02llus", d, h, m, s);
else if (h)
else if (m)
else
}
/*
* Create a storage pool with the given name and initial vdev size.
* Then create the specified number of datasets in the pool.
*
* pool: name of the pool to create.
*
* NOTE(review): the creation calls and the actions taken on error are not
* visible in this view.
*/
static void
ztest_init(char *pool)
{
int error;
/*
* Create the storage pool.
*/
/* Remove any existing pool of the same name first; the result is ignored. */
(void) spa_destroy(pool);
if (error)
if (error)
if (zopt_verbose >= 3)
kernel_fini();
}
/*
* Program entry point: sets up the pool cache location, optionally creates
* the pool, then runs the tests repeatedly in a child process, printing
* per-pass statistics and a final kill/completion summary.
*
* Exit codes visible here: 2 when the child's exit status is non-zero,
* 3 from the signalled-child branch, 4 from the final else branch of the
* status checks; 0 on normal completion.
*
* NOTE(review): the function name line, parameters, the main loop header,
* the fork/wait calls and several conditions are not visible in this view.
*/
int
{
int kills = 0;
int iters = 0;
int i, f;
char timebuf[100];
char numbuf[6];
/* Override location of zpool.cache */
spa_config_path = "/tmp/zpool.cache";
/*
* Blow away any existing copy of zpool.cache
*/
/* Only done when pool initialization was requested (zopt_init != 0). */
if (zopt_init != 0)
(void) remove("/tmp/zpool.cache");
if (zopt_verbose >= 1) {
(void) printf("%llu vdevs, %d datasets, %d threads,"
" %llu seconds...\n",
}
/*
* Create and initialize our storage pool.
*/
/* One pass per requested initialization, numbered from 1. */
for (i = 1; i <= zopt_init; i++) {
(void) printf("ztest_init(), pass %d\n", i);
}
/*
* Initialize the call targets for each function.
*/
for (f = 0; f < ZTEST_FUNCS; f++) {
*zi = ztest_info[f];
/* An interval of 0 is handled separately from non-zero intervals. */
if (*zi->zi_interval == 0)
else
}
/*
* Run the tests in a loop. These tests include fault injection
* to verify that self-healing data works, and forced crashes
* to verify that we never lose on-disk consistency.
*/
int status;
char *tmp;
/*
* Initialize the workload counters for each function.
*/
for (f = 0; f < ZTEST_FUNCS; f++) {
zi->zi_call_time = 0;
}
/* Set the allocation switch size */
/* pid == -1 is the failure case for creating the child. */
if (pid == -1)
if (pid == 0) { /* child */
exit(0);
}
continue;
/* A non-zero child exit code is fatal for the parent (exit 2). */
if (WEXITSTATUS(status) != 0) {
"child exited with code %d\n",
exit(2);
}
} else if (WIFSIGNALED(status)) {
"child died with signal %d\n",
exit(3);
}
/* Count this child as killed. */
kills++;
} else {
"to child\n");
exit(4);
}
/* One more pass completed. */
iters++;
if (zopt_verbose >= 1) {
(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
"%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
}
/* At verbosity 2 and above, print a per-function workload table. */
if (zopt_verbose >= 2) {
(void) printf("\nWorkload summary:\n\n");
(void) printf("%7s %9s %s\n",
"Calls", "Time", "Function");
(void) printf("%7s %9s %s\n",
"-----", "----", "--------");
for (f = 0; f < ZTEST_FUNCS; f++) {
(void) printf("%7llu %9s %s\n",
}
(void) printf("\n");
}
/*
* It's possible that we killed a child during a rename test, in
* which case we'll have a 'ztest_tmp' pool lying around instead
* of 'ztest'. Do a blind rename in case this happened.
*/
kernel_fini();
}
if (zopt_verbose >= 1) {
(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
}
return (0);
}