dsl_pool.c revision 88b7b0f29b20b808b9e06071885b1d6a3ddb6328
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER START
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The contents of this file are subject to the terms of the
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock * Common Development and Distribution License (the "License").
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock * You may not use this file except in compliance with the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fa9e4066f08beec538e775443c5be79dd423fcabahrens * See the License for the specific language governing permissions
fa9e4066f08beec538e775443c5be79dd423fcabahrens * and limitations under the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * When distributing Covered Code, include this CDDL HEADER in each
fa9e4066f08beec538e775443c5be79dd423fcabahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If applicable, add the following below this CDDL HEADER, with the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * fields enclosed by brackets "[]" replaced with your own identifying
fa9e4066f08beec538e775443c5be79dd423fcabahrens * information: Portions Copyright [yyyy] [name of copyright owner]
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER END
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Use is subject to license terms.
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybeeint zfs_write_limit_shift = 3; /* 1/8th of physical memory */
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybeeint zfs_txg_synctime = 5; /* target secs to sync a txg */
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybeeuint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybeeuint64_t zfs_write_limit_max = 0; /* max data payload per txg */
088f389458728c464569a5506b58070254fa4f7dahrensdsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
3cb34c601f3ef3016f638574f5982e80c3735c71ahrens list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
088f389458728c464569a5506b58070254fa4f7dahrens mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrockdsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
088f389458728c464569a5506b58070254fa4f7dahrens err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
088f389458728c464569a5506b58070254fa4f7dahrens err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
088f389458728c464569a5506b58070254fa4f7dahrens err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
088f389458728c464569a5506b58070254fa4f7dahrens /* get scrub status */
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens if (err == 0) {
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
088f389458728c464569a5506b58070254fa4f7dahrens * A new-type scrub was in progress on an old
088f389458728c464569a5506b58070254fa4f7dahrens * pool. Restart from the beginning, since the
088f389458728c464569a5506b58070254fa4f7dahrens * old software may have changed the pool in the
088f389458728c464569a5506b58070254fa4f7dahrens * meantime.
088f389458728c464569a5506b58070254fa4f7dahrens * It's OK if there is no scrub in progress (and if
088f389458728c464569a5506b58070254fa4f7dahrens * there was an I/O error, ignore it).
088f389458728c464569a5506b58070254fa4f7dahrens /* drop our references from dsl_pool_open() */
088f389458728c464569a5506b58070254fa4f7dahrens * Since we held the origin_snap from "syncing" context (which
088f389458728c464569a5506b58070254fa4f7dahrens * includes pool-opening context), it actually only got a "ref"
088f389458728c464569a5506b58070254fa4f7dahrens * and not a hold, so just drop that here.
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
88b7b0f29b20b808b9e06071885b1d6a3ddb6328Matthew Ahrens kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
0a48a24e663a04e34e2ed4e55390ad96f178dbeatimhdsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
088f389458728c464569a5506b58070254fa4f7dahrens /* create and open the MOS (meta-objset) */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* create the pool directory */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* create and open the root dir */
088f389458728c464569a5506b58070254fa4f7dahrens dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* create and open the meta-objset dir */
088f389458728c464569a5506b58070254fa4f7dahrens (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
088f389458728c464569a5506b58070254fa4f7dahrens /* create the root dataset */
088f389458728c464569a5506b58070254fa4f7dahrens dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
088f389458728c464569a5506b58070254fa4f7dahrens /* create the root objset */
088f389458728c464569a5506b58070254fa4f7dahrens VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
fa9e4066f08beec538e775443c5be79dd423fcabahrens list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee data_written = dp->dp_space_towrite[txg & TXG_MASK];
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * If the write limit max has not been explicitly set, set it
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * to a fraction of available physical memory (default 1/8th).
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * Note that we must inflate the limit because the spa
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * inflates write sizes to account for data replication.
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * Check this each sync phase to catch changing memory size.
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee if (physmem != old_physmem && zfs_write_limit_shift) {
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee zfs_write_limit_inflated = MAX(zfs_write_limit_min,
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee spa_get_asize(dp->dp_spa, zfs_write_limit_max));
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * Attempt to keep the sync time consistent by adjusting the
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * amount of write traffic allowed into each transaction group.
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * Weight the throughput calculation towards the current value:
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee * thru = 3/4 old_thru + 1/4 new_thru
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee uint64_t throughput = (data_written * NANOSEC) / write_time;
05715f945c5c007fc4bb6a4e7cf4a749c9b30038Mark Maybee dp->dp_write_limit = MIN(zfs_write_limit_inflated,
fa9e4066f08beec538e775443c5be79dd423fcabahrens zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee * TRUE if the current thread is the tx_sync_thread or if we
c717a56157ae0e6fca6a1e3689ae1edc385716a3maybee * are being called from SPA context during pool initialization.
fa9e4066f08beec538e775443c5be79dd423fcabahrensdsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
44cd46cadd9aab751dae6a4023c1cb5bf316d274billm * Reserve about 1.6% (1/64), or at least 32MB, for allocation
fa9e4066f08beec538e775443c5be79dd423fcabahrens * efficiency.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * XXX The intent log is not accounted for, so it must fit
fa9e4066f08beec538e775443c5be79dd423fcabahrens * within this slop.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If we're trying to assess whether it's OK to do a free,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * cut the reservation in half to allow forward progress
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (e.g. make it possible to rm(1) files from a full pool).
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybeedsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
c5904d138f3bdf0762dbf452a43d5a5c387ea6a8eschrock atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee return (0);
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * Check to see if we have exceeded the maximum allowed IO for
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * this transaction group. We can do this without locks since
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * a little slop here is ok. Note that we do the reserved check
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * with only half the requested reserve: this is because the
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * reserve requests are worst-case, and we really don't want to
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * throttle based off of worst-case estimates.
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * If this transaction group is over 7/8ths capacity, delay
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * the caller 1 clock tick. This will slow down the "fill"
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee * rate until the sync process can catch up with us.
e8397a2be4690aefe43370aae2d4214c6778327egw if (reserved && reserved > (write_limit - (write_limit >> 3)))
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee return (0);
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybeedsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee for (i = 0; i < TXG_SIZE; i++) {
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybeedsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
1ab7f2ded02e7a1bc3c73516eb27efa79bf2a2ffmaybee dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
088f389458728c464569a5506b58070254fa4f7dahrens/* ARGSUSED */
088f389458728c464569a5506b58070254fa4f7dahrensupgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
088f389458728c464569a5506b58070254fa4f7dahrens err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
088f389458728c464569a5506b58070254fa4f7dahrens if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
088f389458728c464569a5506b58070254fa4f7dahrens * The $ORIGIN can't have any data, or the accounting
088f389458728c464569a5506b58070254fa4f7dahrens * will be wrong.
088f389458728c464569a5506b58070254fa4f7dahrens /* The origin doesn't get attached to itself */
088f389458728c464569a5506b58070254fa4f7dahrens return (0);
088f389458728c464569a5506b58070254fa4f7dahrens ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
088f389458728c464569a5506b58070254fa4f7dahrens ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
088f389458728c464569a5506b58070254fa4f7dahrens ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
088f389458728c464569a5506b58070254fa4f7dahrens ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
088f389458728c464569a5506b58070254fa4f7dahrens prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
088f389458728c464569a5506b58070254fa4f7dahrens return (0);
088f389458728c464569a5506b58070254fa4f7dahrens (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
088f389458728c464569a5506b58070254fa4f7dahrens /* create the origin dir, ds, & snap-ds */
088f389458728c464569a5506b58070254fa4f7dahrens dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
088f389458728c464569a5506b58070254fa4f7dahrens VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
088f389458728c464569a5506b58070254fa4f7dahrens dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);