vdev_label.c revision 3e30c24aeefdee1631958ecf17f18da671781956
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * CDDL HEADER START
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * The contents of this file are subject to the terms of the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Common Development and Distribution License (the "License").
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * You may not use this file except in compliance with the License.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * See the License for the specific language governing permissions
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * and limitations under the License.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * When distributing Covered Code, include this CDDL HEADER in each
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * If applicable, add the following below this CDDL HEADER, with the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * fields enclosed by brackets "[]" replaced with your own identifying
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * information: Portions Copyright [yyyy] [name of copyright owner]
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * CDDL HEADER END
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Copyright (c) 2013 by Delphix. All rights reserved.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Virtual Device Labels
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * ---------------------
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * The vdev label serves several distinct purposes:
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 1. Uniquely identify this device as part of a ZFS pool and confirm its
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * identity within the pool.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 2. Verify that all the devices given in a configuration are present
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * within the pool.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 3. Determine the uberblock for the pool.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 4. In case of an import operation, determine the configuration of the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * toplevel vdev of which it is a part.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 5. If an import operation cannot find all the devices in the pool,
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * provide enough information to the administrator to determine which
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * devices are missing.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * It is important to note that while the kernel is responsible for writing the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * label, it only consumes the information in the first three cases. The
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * latter information is only consumed in userland when determining the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * configuration to import a pool.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Label Organization
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * ------------------
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Before describing the contents of the label, it's important to understand how
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * the labels are written and updated with respect to the uberblock.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * When the pool configuration is altered, either because it was newly created
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * or a device was added, we want to update all the labels such that we can deal
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * with fatal failure at any point. To this end, each disk has two labels which
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * are updated before and after the uberblock is synced. Assuming we have
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * labels L1 and L2 and an uberblock UB with the following transaction groups (drawn left to right as L1, UB, L2):
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * +------+ +------+ +------+
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * | | | | | |
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * | t10 | | t10 | | t10 |
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * | | | | | |
548846fe158900a483ca91f47603c6bb6fde9b47Jon Branch * +------+ +------+ +------+
548846fe158900a483ca91f47603c6bb6fde9b47Jon Branch * In this stable state, the labels and the uberblock were all updated within
548846fe158900a483ca91f47603c6bb6fde9b47Jon Branch * the same transaction group (10). Each label is mirrored and checksummed, so
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * that we can detect when we fail partway through writing the label.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * In order to identify which labels are valid, the labels are written in the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * following manner:
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 1. For each vdev, update 'L1' to the new label
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 2. Update the uberblock
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * 3. For each vdev, update 'L2' to the new label
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Given arbitrary failure, we can determine the correct label to use based on
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * the transaction group. If we fail after updating L1 but before updating the
fe644a7302b3235c08aec5fd7992a329f2ee1364Laszlo Hordos * UB, we will notice that L1's transaction group is greater than the uberblock's,
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * so L2 must be valid. If we fail after writing the uberblock but before
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * writing L2, we will notice that L2's transaction group is less than L1's, and
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * therefore L1 is valid.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * Another added complexity is that not every label is updated when the config
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * is synced. If we add a single device, we do not want to have to re-write
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * every label for every device in the pool. This means that both L1 and L2 may
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * be older than the pool uberblock, because the necessary information is stored
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * on another vdev.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * On-disk Format
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * --------------
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * The vdev label consists of two distinct parts, and is wrapped within the
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * vdev_label_t structure. The label includes 8k of padding to permit legacy
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * VTOC disk labels; this padding is otherwise ignored.
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * The first half of the label is a packed nvlist which contains pool wide
0fdda69ce3627d501e4bb3103765f676bb1ab061Laszlo Hordos * properties, per-vdev properties, and configuration information. It is described in more detail below.
* The 'vs' configuration follows the format described in 'spa_config.c'.
nvlist_t *
VDEV_TYPE_RAIDZ) == 0);
if (getstats) {
int c, idx;
KM_SLEEP);
if (idx) {
for (c = 0; c < idx; c++)
B_TRUE) == 0);
B_TRUE) == 0);
B_TRUE) == 0);
B_TRUE) == 0);
B_TRUE) == 0);
B_TRUE) == 0);
B_TRUE) == 0);
case VDEV_AUX_ERR_EXCEEDED:
case VDEV_AUX_EXTERNAL:
aux) == 0);
return (nv);
if (idx) {
nvlist_t *
int error = 0;
return (NULL);
for (int l = 0; l < VDEV_LABELS; l++) {
&label, 0) == 0) {
goto retry;
return (config);
static boolean_t
if (spare_guid)
if (l2cache_guid)
return (B_FALSE);
&vdtxg);
&state) != 0 ||
&device_guid) != 0) {
return (B_FALSE);
&pool_guid) != 0 ||
&txg) != 0)) {
return (B_FALSE);
return (B_FALSE);
return (B_TRUE);
if (spare_guid)
switch (reason) {
case VDEV_LABEL_CREATE:
case VDEV_LABEL_L2CACHE:
return (B_TRUE);
case VDEV_LABEL_REPLACE:
case VDEV_LABEL_SPARE:
return (B_TRUE);
char *pad2;
char *buf;
int error;
return (error);
POOL_STATE_SPARE) == 0);
POOL_STATE_L2CACHE) == 0);
crtxg) == 0);
if (error != 0) {
for (int l = 0; l < VDEV_LABELS; l++) {
goto retry;
return (error);
struct ubl_cbdata {
for (int l = 0; l < VDEV_LABELS; l++) {
for (int l = 0; l < VDEV_LABELS; l++)
for (int v = 0; v < svdcount; v++)
for (int v = 0; v < svdcount; v++)
if (*good_writes == 0)
char *buf;
sizeof (vdev_phys_t),
int error;
KM_SLEEP);
return (error);
int error;
if (tryhard)
return (error);
return (error);