vdev.c revision 2750f8d5ec1b891560ac2224f6c37243d910bd1b
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * CDDL HEADER START
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz * The contents of this file are subject to the terms of the
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Common Development and Distribution License (the "License").
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * You may not use this file except in compliance with the License.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * See the License for the specific language governing permissions
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * and limitations under the License.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * When distributing Covered Code, include this CDDL HEADER in each
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * If applicable, add the following below this CDDL HEADER, with the
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * fields enclosed by brackets "[]" replaced with your own identifying
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * information: Portions Copyright [yyyy] [name of copyright owner]
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * CDDL HEADER END
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Virtual device management.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin/* maximum scrub/resilver I/O queue per leaf vdev */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * When a vdev is added, it will be divided into approximately (but no
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * more than) this number of metaslabs.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Given a vdev type, return the appropriate ops vector.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Default asize function: return the MAX of psize with the asize of
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * all children. This is what's used by anything other than RAID-Z.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Get the minimum allocatable size. We define the allocatable size as
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * the vdev's asize rounded to the nearest metaslab. This allows us to
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * replace or attach devices which don't have the same physical size but
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * can still satisfy the same number of allocations.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * If our parent is NULL (inactive spare or cache) or is the root,
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * just return our own asize.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * The top-level vdev just returns the allocatable size rounded
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * to the nearest metaslab.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * The allocatable space for a raidz vdev is N * sizeof(smallest child),
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * so each child must provide at least 1/Nth of its asize.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin int n = 0;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin return (n);
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz oldsize = pvd->vdev_children * sizeof (vdev_t *);
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz newsize = pvd->vdev_children * sizeof (vdev_t *);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Walk up all ancestors to update guid sum.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Walk up all ancestors to update guid sum.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Remove any holes in the child array.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Allocate and minimally initialize a vdev_t.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * The root vdev's guid will also be the pool guid,
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * which must be unique among all pools.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Any other vdev's guid must be unique within the pool.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin for (int t = 0; t < DTL_TYPES; t++) {
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Allocate a new vdev. The 'alloctype' is used to control whether we are
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * creating a new vdev or loading an existing one - the behavior is slightly
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * different for each case.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * If this is a load, get the vdev guid from the nvlist.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Otherwise, vdev_alloc_common() will generate one for us.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * The first allocated vdev must be of type 'root'.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Determine whether we're a log vdev.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Set the nparity property for RAID-Z vdevs.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Previous versions could only support 1 or 2 parity
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * We require the parity to be specified for SPAs that
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * support multiple parity levels.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Otherwise, we default to 1 parity device for RAID-Z.
char *aux;
for (int t = 0; t < DTL_TYPES; t++) {
for (t = 0; t < TXG_SIZE; t++) {
vdev_t *
return (mvd);
uint64_t m;
int error;
if (oldc != 0) {
if (txg == 0) {
if (error)
return (error);
if (error)
return (error);
if (txg == 0)
if (txg == 0)
uint64_t m;
for (m = 0; m < count; m++) {
typedef struct vdev_probe_stats {
int vps_flags;
zio_t *
return (NULL);
return (NULL);
return (pio);
return (NULL);
return (B_TRUE);
return (B_TRUE);
return (B_FALSE);
for (int c = 0; c < children; c++)
for (int c = 0; c < children; c++)
int error;
if (error) {
return (error);
psize = 0;
return (error);
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
&aux_guid) != 0)
aux_guid = 0;
&guid) != 0 ||
&top_guid) != 0 ||
&state) != 0) {
int error;
return (error);
return (dirty);
return (empty);
static uint64_t
static uint64_t
static boolean_t
return (B_TRUE);
return (B_TRUE);
return (B_FALSE);
int minref;
if (scrub_txg != 0 &&
if (scrub_done)
if (txg != 0)
for (int t = 0; t < DTL_TYPES; t++) {
if (t == DTL_SCRUB)
if (t == DTL_PARTIAL)
int error = 0;
if (error)
return (error);
return (error);
if (error != 0)
return (error);
return (B_TRUE);
return (required);
return (needed);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
if (reassess)
aux);
if (newstate)
if (postevent)
int error = 0;
top:
if (error)
goto top;
int error;
return (error);
return (B_FALSE);
return (B_TRUE);
for (int t = 0; t < ZIO_TYPES; t++) {
int corrupted = 0;
degraded++;
faulted++;
degraded++;
corrupted++;
const char *class;
switch (aux) {
case VDEV_AUX_OPEN_FAILED:
case VDEV_AUX_CORRUPT_DATA:
case VDEV_AUX_NO_REPLICAS:
case VDEV_AUX_BAD_GUID_SUM:
case VDEV_AUX_TOO_SMALL:
case VDEV_AUX_BAD_LABEL:
return (B_FALSE);
return (B_FALSE);
return (B_FALSE);
return (B_TRUE);
return (B_TRUE);
return (B_TRUE);
return (B_FALSE);