vdev.c revision 44ecc5327ab4ce0750dcca2a17e05566bf2812e2
4496171313bed39e96f21bc2f9faf2868e267ae3girish * CDDL HEADER START
4496171313bed39e96f21bc2f9faf2868e267ae3girish * The contents of this file are subject to the terms of the
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Common Development and Distribution License (the "License").
4496171313bed39e96f21bc2f9faf2868e267ae3girish * You may not use this file except in compliance with the License.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
4496171313bed39e96f21bc2f9faf2868e267ae3girish * See the License for the specific language governing permissions
4496171313bed39e96f21bc2f9faf2868e267ae3girish * and limitations under the License.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * When distributing Covered Code, include this CDDL HEADER in each
4496171313bed39e96f21bc2f9faf2868e267ae3girish * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * If applicable, add the following below this CDDL HEADER, with the
4496171313bed39e96f21bc2f9faf2868e267ae3girish * fields enclosed by brackets "[]" replaced with your own identifying
4496171313bed39e96f21bc2f9faf2868e267ae3girish * information: Portions Copyright [yyyy] [name of copyright owner]
4496171313bed39e96f21bc2f9faf2868e267ae3girish * CDDL HEADER END
fb2f18f820d90b001aea4fb27dd654bc1263c440esaxe * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Virtual device management.
4496171313bed39e96f21bc2f9faf2868e267ae3girish/* maximum scrub/resilver I/O queue per leaf vdev */
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Given a vdev type, return the appropriate ops vector.
4496171313bed39e96f21bc2f9faf2868e267ae3girish for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Default asize function: return the MAX of psize with the asize of
4496171313bed39e96f21bc2f9faf2868e267ae3girish * all children. This is what's used by anything other than RAID-Z.
4496171313bed39e96f21bc2f9faf2868e267ae3girish uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
4496171313bed39e96f21bc2f9faf2868e267ae3girish csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Get the minimum allocatable size. We define the allocatable size as
4496171313bed39e96f21bc2f9faf2868e267ae3girish * the vdev's asize rounded to the nearest metaslab. This allows us to
4496171313bed39e96f21bc2f9faf2868e267ae3girish * replace or attach devices which don't have the same physical size but
4496171313bed39e96f21bc2f9faf2868e267ae3girish * can still satisfy the same number of allocations.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * If our parent is NULL (inactive spare or cache) or is the root,
4496171313bed39e96f21bc2f9faf2868e267ae3girish * just return our own asize.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * The top-level vdev just returns the allocatable size rounded
4496171313bed39e96f21bc2f9faf2868e267ae3girish * to the nearest metaslab.
4496171313bed39e96f21bc2f9faf2868e267ae3girish return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
4496171313bed39e96f21bc2f9faf2868e267ae3girish * The allocatable space for a raidz vdev is N * sizeof(smallest child),
4496171313bed39e96f21bc2f9faf2868e267ae3girish * so each child must provide at least 1/Nth of its asize.
4496171313bed39e96f21bc2f9faf2868e267ae3girish ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
459190a5c46206e7885f6a649a055ceb46be49a7rsmaeda if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
459190a5c46206e7885f6a649a055ceb46be49a7rsmaeda ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
fb2f18f820d90b001aea4fb27dd654bc1263c440esaxe ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
459190a5c46206e7885f6a649a055ceb46be49a7rsmaeda pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
4496171313bed39e96f21bc2f9faf2868e267ae3girish cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
459190a5c46206e7885f6a649a055ceb46be49a7rsmaeda ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Walk up all ancestors to update guid sum.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Walk up all ancestors to update guid sum.
4496171313bed39e96f21bc2f9faf2868e267ae3girish * Remove any holes in the child array.
4496171313bed39e96f21bc2f9faf2868e267ae3girish ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Allocate and minimally initialize a vdev_t.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dpvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * The root vdev's guid will also be the pool guid,
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * which must be unique among all pools.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Any other vdev's guid must be unique within the pool.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp for (int t = 0; t < DTL_TYPES; t++) {
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp return (vd);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Allocate a new vdev. The 'alloctype' is used to control whether we are
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * creating a new vdev or loading an existing one - the behavior is slightly
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * different for each case.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dpvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * If this is a load, get the vdev guid from the nvlist.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Otherwise, vdev_alloc_common() will generate one for us.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * The first allocated vdev must be of type 'root'.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Determine whether we're a log vdev.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Set the nparity property for RAID-Z vdevs.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Previous versions could only support 1 or 2 parity devices.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * We require the parity to be specified for SPAs that
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * support multiple parity levels.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Otherwise, we default to 1 parity device for RAID-Z.
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
102033aa92edf302ad31b3bdd7c6fcd2d6910903dp * Set the whole_disk property. If it's not specified, leave the value
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp * Look for the 'not present' flag. This will only be set if the device
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp * was not present at the time of import.
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp * Get the alignment requirement.
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp * Retrieve the vdev creation time.
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp * If we're a top-level vdev, try to load the allocation parameters.
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
fe70c9cf90dfc23d18485fb7b4b20a1175d53a8bdp (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
char *aux;
for (int t = 0; t < DTL_TYPES; t++) {
for (t = 0; t < TXG_SIZE; t++) {
vdev_t *
return (mvd);
uint64_t m;
int error;
if (oldc != 0) {
if (txg == 0) {
if (error)
return (error);
if (object != 0) {
if (error)
return (error);
if (txg == 0)
if (txg == 0)
uint64_t m;
for (m = 0; m < count; m++)
typedef struct vdev_probe_stats {
int vps_flags;
zio_t *
return (NULL);
return (NULL);
return (pio);
return (NULL);
return (B_TRUE);
return (B_TRUE);
return (B_FALSE);
for (int c = 0; c < children; c++)
for (int c = 0; c < children; c++)
int error;
return (ENXIO);
return (ENXIO);
if (error) {
return (error);
return (ENXIO);
return (EOVERFLOW);
return (EOVERFLOW);
psize = 0;
return (EINVAL);
return (EINVAL);
return (error);
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
return (EBADF);
&aux_guid) != 0)
aux_guid = 0;
&guid) != 0 ||
&top_guid) != 0 ||
&state) != 0) {
return (EBADF);
int error;
return (error);
return (dirty);
return (empty);
int minref;
if (scrub_txg != 0 &&
if (scrub_done)
if (txg != 0)
for (int t = 0; t < DTL_TYPES; t++) {
if (t == DTL_SCRUB)
if (t == DTL_PARTIAL)
int error;
return (error);
return (error);
&smlock);
return (B_TRUE);
return (required);
return (needed);
if (reassess)
aux);
if (newstate)
int error = 0;
top:
if (error)
goto top;
int error;
return (error);
return (B_FALSE);
return (B_TRUE);
for (int t = 0; t < ZIO_TYPES; t++) {
int corrupted = 0;
degraded++;
faulted++;
degraded++;
corrupted++;
const char *class;
switch (aux) {
case VDEV_AUX_OPEN_FAILED:
case VDEV_AUX_CORRUPT_DATA:
case VDEV_AUX_NO_REPLICAS:
case VDEV_AUX_BAD_GUID_SUM:
case VDEV_AUX_TOO_SMALL:
case VDEV_AUX_BAD_LABEL:
return (B_FALSE);
return (B_FALSE);
return (B_FALSE);
return (B_FALSE);
return (B_TRUE);