/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
*/
#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
    const zbookmark_phys_t *);
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
/* max number of blocks to free in a single TXG */
uint64_t zfs_free_max_blocks = UINT64_MAX;

extern int zfs_txg_timeout;
/*
 * Enable/disable the processing of the free_bpobj object.
 */
boolean_t zfs_free_bpobj_enabled = B_TRUE;
/* the order has to match pool_scan_func */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
};
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
int err;
uint64_t f;
/*
* It's possible that we're resuming a scan after a reboot so
* make sure that the scan_async_destroying flag is initialized
* appropriately.
*/
if (err == 0) {
/*
* There was an old-style scrub in progress. Restart a
* new-style scrub from the beginning.
*/
zfs_dbgmsg("old-style scrub was in progress; "
"restarting new-style scrub in txg %llu",
/*
* Load the queue obj from the old location so that it
* can be freed by dsl_scan_done().
*/
} else {
		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);
/*
* A new-type scrub was in progress on an old
* pool, and the pool was accessed by old
* software. Restart from the beginning, since
* the old software may have changed the pool in
* the meantime.
*/
zfs_dbgmsg("new-style scrub was modified "
"by old software; restarting in txg %llu",
}
}
return (0);
}
void
dsl_scan_fini(dsl_pool_t *dp)
{
}
}
/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
return (0);
}
static void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
scn->scn_restart_txg = 0;
scn->scn_done_txg = 0;
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
/* rewrite all disk labels */
} else {
}
/*
* If this is an incremental scrub, limit the DDT scrub phase
* to just the auto-ditto class (for correctness); the rest
* of the scrub should go faster using top-down pruning.
*/
}
/* back to the generic stuff */
	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_zalloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
"func=%u mintxg=%llu maxtxg=%llu",
}
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
static const char *old_names[] = {
"scrub_bookmark",
"scrub_ddt_bookmark",
"scrub_ddt_class_max",
"scrub_queue",
"scrub_min_txg",
"scrub_max_txg",
"scrub_func",
"scrub_errors",
};
int i;
/* Remove any remnants of an old-style scrub. */
for (i = 0; old_names[i]; i++) {
}
}
/*
* If we were "restarted" from a stopped state, don't bother
* with anything else.
*/
return;
	if (complete)
		scn->scn_phys.scn_state = DSS_FINISHED;
	else
		scn->scn_phys.scn_state = DSS_CANCELED;

	if (dsl_scan_restarting(scn, tx))
		spa_history_log_internal(spa, "scan aborted, restarting", tx,
		    "errors=%llu", spa_get_errlog_size(spa));
	else if (!complete)
		spa_history_log_internal(spa, "scan cancelled", tx,
		    "errors=%llu", spa_get_errlog_size(spa));
	else
		spa_history_log_internal(spa, "scan done", tx,
		    "errors=%llu", spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
while (spa->spa_scrub_inflight > 0) {
&spa->spa_scrub_lock);
}
		/*
		 * If the scrub/resilver completed, update all DTLs to
		 * reflect this.  Whether it succeeded or not, vacate
* all temporary scrub DTLs.
*/
if (complete) {
}
/*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/
}
}
/* ARGSUSED */
static int
dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
{
return (0);
}
/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
{
}
int
dsl_scan_cancel(dsl_pool_t *dp)
{
	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
void
{
}
void
{
}
static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;

	if (ds->ds_is_snapshot)
		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
	return (smt);
}
static void
{
}
extern int zfs_vdev_async_write_active_min_dirty_percent;
static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
return (B_FALSE);
if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
return (B_FALSE);
/*
* We pause if:
* - we have scanned for the maximum time: an entire txg
* timeout (default 5 sec)
* or
* - we have scanned for at least the minimum time (default 1 sec
* for scrub, 3 sec for resilver), and either we have sufficient
* dirty data that we are starting to write more quickly
* (default 30%), or someone is explicitly waiting for this txg
* to complete.
* or
* - the spa is shutting down because this pool is being exported
* or the machine is rebooting.
*/
if (zb) {
dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
}
dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
return (B_TRUE);
}
return (B_FALSE);
}
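/*
 * Illustrative sketch, added for exposition only and never called: the pause
 * decision described in the comment above, reduced to its inputs.  The helper
 * and its parameter names are hypothetical; the real code derives the elapsed
 * time from scn_sync_start_time and consults txg_sync_waiting(),
 * zfs_vdev_async_write_active_min_dirty_percent and spa_shutting_down().
 */
static boolean_t
scan_pause_example(uint64_t elapsed_ns, uint64_t min_time_ms,
    uint64_t txg_timeout_sec, int dirty_pct, int dirty_min_pct,
    boolean_t txg_wait, boolean_t shutting_down)
{
	/* scanned for an entire txg timeout (default 5 sec) */
	if (elapsed_ns / NANOSEC >= txg_timeout_sec)
		return (B_TRUE);
	/* scanned at least the minimum time and the txg is wanted */
	if (elapsed_ns / MICROSEC >= min_time_ms &&
	    (txg_wait || dirty_pct >= dirty_min_pct))
		return (B_TRUE);
	/* the pool is being exported or the machine is rebooting */
	return (shutting_down);
}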
typedef struct zil_scan_arg {
	dsl_pool_t	*zsa_dp;
	zil_header_t	*zsa_zh;
} zil_scan_arg_t;
/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
return (0);
	/*
	 * One block ("stubby") may have been allocated a long time ago; we
	 * want to visit that one because it has been allocated (on-disk)
	 * even if it hasn't been claimed (even though for scrub there's
	 * nothing to do to it).
	 */
return (0);
return (0);
}
/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
		if (BP_IS_HOLE(bp) ||
		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);
/*
* birth can be < claim_txg if this record's txg is
* already txg sync'ed (but this log block contains
* other records that are not synced)
*/
return (0);
}
return (0);
}
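/*
 * Illustrative sketch, added for exposition only and never called: the
 * claim/birth test applied by the record-level callback above.  A TX_WRITE
 * record's block is only worth visiting if the ZIL was claimed
 * (claim_txg != 0) and the block was born at or after the claim txg; a
 * birth below claim_txg means that record's txg has already been synced
 * out.  (Whole log blocks additionally get the "stubby" exception described
 * earlier.)  The helper name is hypothetical.
 */
static boolean_t
zil_record_bp_needs_scan_example(uint64_t claim_txg, uint64_t birth_txg)
{
	if (claim_txg == 0)
		return (B_FALSE);	/* ZIL never claimed: nothing to scan */
	return (birth_txg >= claim_txg);
}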
static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
*/
return;
}
/* ARGSUSED */
static void
{
return;
return;
}
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
return (B_TRUE);
/*
* If we found the block we're trying to resume from, or
* we went past it to a different object, zero it out to
* indicate that it's OK to start checking for pausing
* again.
*/
dprintf("resuming at %llx/%llx/%llx/%llx\n",
}
}
return (B_FALSE);
}
/*
* Return nonzero on i/o error.
* Return new buf to write out in *bufp.
*/
static int
{
int err;
if (BP_GET_LEVEL(bp) > 0) {
int i;
if (err) {
return (err);
}
}
}
int i, j;
if (err) {
return (err);
}
for (j = 0; j < cdnp->dn_nblkptr; j++) {
}
}
}
if (err) {
return (err);
}
if (OBJSET_BUF_HAS_USERUSED(buf)) {
			/*
			 * We also always do user/group accounting
			 * objects, and never skip them, even if we are
* pausing. This is necessary so that the space
* deltas from this txg get integrated.
*/
}
}
return (0);
}
static void
{
int j;
for (j = 0; j < dnp->dn_nblkptr; j++) {
}
0, DMU_SPILL_BLKID);
}
}
/*
* The arguments are in this order because mdb can only print the
* first 5; we want them to be useful.
*/
static void
{
/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
return;
return;
if (BP_IS_HOLE(bp))
return;
"visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
bp);
return;
return;
/*
	 * If dsl_scan_ddt() has already visited this block, it will have
* already done any translations or scrubbing, so don't call the
* callback again.
*/
return;
}
/*
* If this block is from the future (after cur_max_txg), then we
* are doing this on behalf of a deleted snapshot, and we will
* revisit the future block on the next pass of this dataset.
* Don't scan it now unless we need to because something
* under it was modified.
*/
}
}
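/*
 * Illustrative sketch, added for exposition only and never called: the
 * "block from the future" test described above.  While scanning on behalf of
 * a deleted snapshot the scan clamps its interest to scn_cur_max_txg; blocks
 * born after that txg are skipped now and picked up on a later pass of the
 * dataset.  The helper name is hypothetical.
 */
static boolean_t
bp_is_from_future_example(uint64_t birth_txg, uint64_t cur_max_txg)
{
	/* descend/scan now only when birth_txg <= cur_max_txg */
	return (birth_txg > cur_max_txg);
}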
static void
{
}
void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
return;
if (ds->ds_is_snapshot) {
/*
* Note:
* - scn_cur_{min,max}_txg stays the same.
* - Setting the flag is not really necessary if
* scn_cur_max_txg == scn_max_txg, because there
* is nothing after this snapshot that we care
* about. However, we set it anyway and then
* ignore it when we retraverse it in
* dsl_scan_visitds().
*/
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset zb_objset to %llu",
} else {
ZB_DESTROYED_OBJSET, 0, 0, 0);
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset bookmark to -1,0,0,0",
}
if (ds->ds_is_snapshot) {
/*
* We keep the same mintxg; it could be >
* ds_creation_txg if the previous snapshot was
* deleted too.
*/
zfs_dbgmsg("destroying ds %llu; in queue; "
"replacing with %llu",
} else {
zfs_dbgmsg("destroying ds %llu; in queue; removing",
}
}
/*
* dsl_scan_sync() should be called after this, and should sync
* out our changed state, but just to be safe, do it here.
*/
}
void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
return;
zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
"reset zb_objset to %llu",
zfs_dbgmsg("snapshotting ds %llu; in queue; "
"replacing with %llu",
}
}
void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
return;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
}
int err;
/* Both were there to begin with */
}
zfs_dbgmsg("clone_swap ds %llu; in queue; "
"replacing with %llu",
zfs_dbgmsg("clone_swap ds %llu; in queue; "
"replacing with %llu",
}
}
struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};
/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
int err;
return (0);
if (err)
return (err);
if (err)
return (err);
}
return (0);
}
static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
/*
* This can happen if this snapshot was created after the
* scan started, and we already completed a previous snapshot
* that was created after the scan started. This snapshot
* only references blocks with:
*
* birth < our ds_creation_txg
* cur_min_txg is no less than ds_creation_txg.
* We have already visited these blocks.
* or
* birth > scn_max_txg
* The scan requested not to visit these blocks.
*
* Subsequent snapshots (and clones) can reference our
* blocks, or blocks with even higher birth times.
* Therefore we do not need to visit them either,
* so we do not add them to the work queue.
*
* Note that checking for cur_min_txg >= cur_max_txg
* is not sufficient, because in that case we may need to
* visit subsequent snapshots. This happens when min_txg > 0,
* which raises cur_min_txg. In this case we will visit
* this dataset but skip all of its blocks, because the
	 * rootbp's birth time is < cur_min_txg.  Then we will
	 * add all snapshots to the work queue.
	 */
zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
"cur_min_txg (%llu) >= max_txg (%llu)",
goto out;
}
goto out;
/*
* Only the ZIL in the head (non-snapshot) is valid. Even though
* snapshots can have ZIL block pointers (which may be the same
* BP as in the head), they must be ignored. So we traverse the
* ZIL here, rather than in scan_recurse(), because the regular
* snapshot block-sharing rules don't apply to it.
*/
/*
* Iterate over the bps in this ds.
*/
zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
"pausing=%u",
(int)scn->scn_pausing);
if (scn->scn_pausing)
goto out;
/*
* We've finished this pass over this dataset.
*/
/*
* If we did not completely visit this dataset, do another pass.
*/
zfs_dbgmsg("incomplete pass; visiting again");
goto out;
}
/*
	 * Add descendant datasets to work queue.
*/
}
/*
* A bug in a previous version of the code could
* cause upgrade_clones_cb() to not set
* ds_next_snap_obj when it should, leading to a
* missing entry. Therefore we can only use the
* next_clones_obj when its count is correct.
*/
if (err == 0 &&
}
if (usenext) {
} else {
}
}
out:
}
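/*
 * Illustrative sketch, added for exposition only and never called: the range
 * test behind the "scanning dataset ... is unnecessary" message above.  A
 * dataset can be skipped outright when every block it can reference is either
 * already visited (born before cur_min_txg) or outside the requested scan
 * (born after max_txg); for a snapshot created after the scan started,
 * cur_min_txg has been raised to at least its ds_creation_txg, so this holds.
 * The helper name is hypothetical.
 */
static boolean_t
dataset_scan_unnecessary_example(uint64_t cur_min_txg, uint64_t scan_max_txg)
{
	return (cur_min_txg >= scan_max_txg);
}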
/* ARGSUSED */
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
int err;
if (err)
return (err);
if (err) {
return (err);
}
/*
* If this is a clone, we don't need to worry about it for now.
*/
return (0);
}
}
return (0);
}
/*
 * Scrub/dedup interaction.
 *
* If there are N references to a deduped block, we don't want to scrub it
* N times -- ideally, we should scrub it exactly once.
*
* We leverage the fact that the dde's replication class (enum ddt_class)
* is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
* (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
*
* To prevent excess scrubbing, the scrub begins by walking the DDT
* to find all blocks with refcnt > 1, and scrubs each of these once.
* Since there are two replication classes which contain blocks with
* refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
* Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
*
* There would be nothing more to say if a block's refcnt couldn't change
* during a scrub, but of course it can so we must account for changes
* in a block's replication class.
*
* Here's an example of what can occur:
*
* If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
* when visited during the top-down scrub phase, it will be scrubbed twice.
* This negates our scrub optimization, but is otherwise harmless.
*
* If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
* on each visit during the top-down scrub phase, it will never be scrubbed.
* To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
 * reference class transitions to a higher level (i.e., DDT_CLASS_UNIQUE to
* DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
* while a scrub is in progress, it scrubs the block right then.
*/
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
int error;
uint64_t n = 0;
break;
dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
/* There should be no pending changes to the dedup table */
n++;
break;
}
zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
(int)scn->scn_pausing);
}
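/*
 * Illustrative sketch, added for exposition only and never called: the
 * two-phase ordering described in the block comment above.  Phase one walks
 * the DDT from the most replicated class (DDT_CLASS_DITTO) down to
 * zfs_scrub_ddt_class_max, so every block with refcnt > 1 is scrubbed exactly
 * once; phase two is the normal top-down traversal, which only issues I/O for
 * blocks in classes the DDT phase did not cover.  The helpers below use plain
 * ints as hypothetical stand-ins for the real enum ddt_class machinery.
 */
static boolean_t
scrub_in_ddt_phase_example(int ddt_class, int ddt_class_max)
{
	/* DDT phase covers classes 0 (DITTO) .. ddt_class_max (DUPLICATE) */
	return (ddt_class <= ddt_class_max);
}

static boolean_t
scrub_in_topdown_phase_example(int ddt_class, int ddt_class_max)
{
	/* top-down phase only touches what the DDT phase did not */
	return (ddt_class > ddt_class_max);
}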
/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
return;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
			continue;
}
}
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
if (scn->scn_pausing)
return;
}
/* First do the MOS & ORIGIN */
if (scn->scn_pausing)
return;
} else {
}
/*
* If we were paused, continue from here. Note if the
* ds we were paused on was deleted, the zb_objset may
* be -1, so we will skip this and find a new objset
* below.
*/
if (scn->scn_pausing)
return;
}
/*
* In case we were paused right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
/* keep pulling things out of the zap-object-as-queue */
if (za.za_first_integer != 0) {
} else {
}
if (scn->scn_pausing)
return;
}
}
static boolean_t
dsl_scan_free_should_pause(dsl_scan_t *scn)
{
if (zfs_recover)
return (B_FALSE);
return (B_TRUE);
}
static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (SET_ERROR(ERESTART));
	}
return (0);
}
boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
	spa_t *spa = scn->scn_dp->dp_spa;
	uint64_t used = 0, comp, uncomp;

	if (spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);
if (spa_shutting_down(spa))
return (B_FALSE);
return (B_TRUE);
}
return (used != 0);
}
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
*/
zfs_dbgmsg("restarting scan func=%u txg=%llu",
}
/*
* Only process scans in sync pass 1.
*/
return;
/*
* If the spa is shutting down, then stop scanning. This will
* ensure that the scan does not dirty any new data during the
* shutdown phase.
*/
if (spa_shutting_down(spa))
return;
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
return;
scn->scn_visited_this_txg = 0;
/*
* First process the async destroys. If we pause, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
	if (zfs_free_bpobj_enabled &&
	    spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
	}
err = 0;
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
}
/* finished; deactivate async destroy feature */
dp->dp_bptree_obj = 0;
} else {
/*
* If we didn't make progress, mark the async
* destroy as stalled, so that we will not initiate
* a spa_sync() on its behalf. Note that we only
* check this if we are not finished, because if the
* bptree had no blocks for us to visit, we can
* finish without "making progress".
*/
			scn->scn_async_stalled =
			    (scn->scn_visited_this_txg == 0);
}
}
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
"free_bpobj/bptree txg %llu; err=%u",
scn->scn_visited_this_txg = 0;
/*
* Write out changes to the DDT that may be required as a
		 * result of the blocks freed.  This ensures that the DDT
		 * is clean when a scrub/resilver runs.
		 */
}
if (err != 0)
return;
/*
* We have finished background destroying, but there is still
* some space left in the dp_free_dir. Transfer this leaked
* space to the dp_leak_dir.
*/
LEAK_DIR_NAME, tx);
}
}
/* finished; verify that space accounting went to zero */
}
return;
/* finished with scan. */
return;
}
zfs_dbgmsg("doing scan sync txg %llu; "
"ddt bm=%llu/%llu/%llu/%llx",
} else {
zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
}
zfs_dbgmsg("visited %llu blocks in %llums",
if (!scn->scn_pausing) {
zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
}
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
while (spa->spa_scrub_inflight > 0) {
&spa->spa_scrub_lock);
}
}
}
/*
* This will start a new scan, or restart an existing one.
*/
void
dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
if (txg == 0) {
} else {
}
}
{
}
/*
* scrub consumers
*/
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
/*
* If we resume after a reboot, zab will be NULL; don't record
* incomplete stats in that case.
*/
return;
for (i = 0; i < 4; i++) {
if (t & DMU_OT_NEWTYPE)
t = DMU_OT_OTHER;
int equal;
switch (BP_GET_NDVAS(bp)) {
case 2:
break;
case 3:
			if (equal == 1)
				zb->zb_ditto_2_of_3_samevdev_blocks++;
			else if (equal == 3)
				zb->zb_ditto_3_of_3_samevdev_blocks++;
break;
}
}
}
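/*
 * Illustrative sketch, added for exposition only and never called: how the
 * "equal" count in the 3-DVA case above classifies ditto copies.  It is the
 * number of matching vdev pairs among the three DVAs: 0 means all copies are
 * on distinct vdevs, 1 means exactly two share a vdev, 3 means all three are
 * on one vdev (2 cannot occur).  Hypothetical helper operating on plain
 * vdev ids.
 */
static int
same_vdev_pairs_example(uint64_t v0, uint64_t v1, uint64_t v2)
{
	return ((v0 == v1) + (v0 == v2) + (v1 == v2));
}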
static void
dsl_scan_scrub_done(zio_t *zio)
{
}
}
static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
int scan_delay = 0;
return (0);
if (BP_IS_EMBEDDED(bp))
return (0);
} else {
}
/* If it's an intent log block, failure is expected. */
for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
/*
* Keep track of how much data we've examined so that
* zpool(1M) status can make useful progress reports.
*/
/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
} else {
phys_birth, 1);
}
}
}
if (needs_io && !zfs_no_scrub_io) {
		/*
		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
		 * then throttle our workload to limit the impact of a scan.
		 */
}
/* do not relocate this block */
return (0);
}
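/*
 * Illustrative sketch, added for exposition only and never called: the
 * needs_io decision for a single DVA in the scrub callback above.  A scrub
 * reads every copy in the scan's txg range; a resilver only reads copies
 * whose birth falls inside a DTL (missing-data) range of the vdev holding
 * the copy -- the real code consults vdev_dtl_contains() with DTL_MISSING
 * (DTL_PARTIAL for gang blocks).  The helper name is hypothetical.
 */
static boolean_t
dva_needs_io_example(boolean_t is_scrub, boolean_t dtl_contains_birth)
{
	if (is_scrub)
		return (B_TRUE);
	/* resilver: only read this copy if its vdev is missing that txg */
	return (dtl_contains_birth);
}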
int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
/*
* Purge all vdev caches and probe all devices. We do this here
* rather than in sync context because this requires a writer lock
* on the spa_config lock, which we can't do from sync context. The
* spa_scrub_reopen flag indicates that vdev_open() should not
* attempt to start another scrub.
*/
}
static boolean_t
dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
{
	return (scn->scn_restart_txg != 0 &&
	    scn->scn_restart_txg <= tx->tx_txg);
}