/* metaslab.c revision be6fd75a69ae679453d9cda5bff3326111e6d1ca */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
/*
* Allow allocations to switch to gang blocks quickly. We do this to
* avoid having to load lots of space_maps in a given txg. There are,
* however, some cases where we want to avoid "fast" ganging and instead
* we want to do an exhaustive search of all metaslabs on this device.
* Currently we don't allow any gang, zil, or dump device related allocations
* to "fast" gang.
*/
#define CAN_FASTGANG(flags) \
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* space_map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
int zfs_condense_pct = 200;	/* tunable; must be >= 100 (see comment above) */
/*
* This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping
* allocations on that device.
*/
/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;	/* nonzero: keep all space maps in core to verify frees */
/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
* aggressive strategy (i.e. search by size rather than offset).
*/
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 4;	/* below this free %, switch first-fit -> best-fit */
/*
* A metaslab is considered "free" if it contains a contiguous
* segment which is greater than metaslab_min_alloc_size.
*/
/*
* Max number of space_maps to prefetch.
*/
/*
* Percentage bonus multiplier for metaslabs that are in the bonus area.
*/
int metaslab_smo_bonus_pct = 150;	/* % weight multiplier for bonus-area metaslabs */
/*
* Should we be willing to write data to degraded vdevs?
*/
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
{
return (mc);
}
void
{
}
int
{
/*
* Must hold one of the spa_config locks.
*/
return (0);
do {
return (0);
}
void
{
}
{
}
{
return (mc->mc_deferred);
}
{
}
{
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
static int
{
return (1);
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
return (-1);
return (1);
return (0);
}
{
mg->mg_activation_count = 0;
return (mg);
}
void
{
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
* because we're done, and possibly removing the vdev.
*/
}
void
{
if (++mg->mg_activation_count <= 0)
return;
} else {
}
}
void
{
if (--mg->mg_activation_count != 0) {
return;
}
} else {
}
}
static void
{
}
static void
{
}
static void
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 510].
*/
}
/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
*/
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
static uint64_t
{
return (offset);
}
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
}
static void
{
}
static void
{
/* tear down the tree */
}
}
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
/*
* Return the maximum contiguous segment within the metaslab.
*/
{
return (0ULL);
}
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static uint64_t
{
}
/* ARGSUSED */
{
return (B_TRUE);
}
static space_map_ops_t metaslab_ff_ops = {
};
/*
* ==========================================================================
* Dynamic block allocator -
* Uses the first fit allocation scheme until space get low and then
* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
t = sm->sm_pp_root;
*cursor = 0;
}
}
static boolean_t
{
if (max_size >= metaslab_df_alloc_threshold &&
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_df_ops = {
};
/*
* ==========================================================================
* Other experimental allocators
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
t = sm->sm_pp_root;
*cursor = *extent_end = 0;
if (offset != -1)
} else {
}
return (offset);
}
static boolean_t
{
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_cdf_ops = {
};
static uint64_t
{
return (-1ULL);
t = sm->sm_pp_root;
}
}
}
return (-1ULL);
}
static boolean_t
{
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_ndf_ops = {
};
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
{
/*
* We create the main space map here, but we don't create the
* allocmaps and freemaps until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
}
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(msp, 0);
if (txg != 0) {
}
return (msp);
}
void
{
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
#define METASLAB_ACTIVE_MASK \
static uint64_t
{
/*
* This vdev is in the process of being removed so there is nothing
* for us to do here.
*/
if (vd->vdev_removing) {
return (0);
}
/*
* The baseline weight is the metaslab's free space.
*/
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
/*
* For locality, assign higher weight to metaslabs which have
* a lower offset than what we've already activated.
*/
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off.
*/
}
return (weight);
}
static void
{
int m;
/*
* Prefetch the next potential metaslabs
*/
/* If we have reached our prefetch limit then we're done */
if (m >= metaslab_prefetch_limit)
break;
}
}
}
static int
{
if (error) {
return (error);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++)
}
/*
* Track the bonus area as we activate new metaslabs.
*/
}
}
return (0);
}
static void
{
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
}
/*
* Determine if the in-core space map representation can be condensed on-disk.
* We would like to use the following criteria to make our decision:
*
* 1. The size of the space map object should not dramatically increase as a
* result of writing out our in-core free map.
*
* 2. The minimal on-disk space map representation is zfs_condense_pct/100
* times the size of the in-core representation (i.e. zfs_condense_pct = 110
* and in-core = 1MB, minimal = 1.1MB).
*
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered AVL tree in the space map and calculate the
* size required for the largest segment in our in-core free map. If the
* size required to represent that segment on disk is larger than the space
* map object then we avoid condensing this map.
*
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
*/
static boolean_t
{
/*
* Use the sm_pp_root AVL tree, which is ordered by size, to obtain
* the largest segment in the in-core free map. If the tree is
* empty then we should condense the map.
*/
return (B_TRUE);
/*
* Calculate the number of 64-bit entries this segment would
* require when written to disk. If this single segment would be
* larger on-disk than the entire current on-disk structure, then
* clearly condensing will increase the on-disk structure size.
*/
}
/*
* Condense the on-disk space map representation to its minimized form.
* The minimized form consists of a small number of allocations followed by
* the in-core free map.
*/
static void
{
"smo size %llu, segments %lu", txg,
/*
* Create a map that is a 100% allocated map. We remove segments
* that have been freed in this txg, any deferred frees that exist,
* and any allocation in the future. Removing segments should be
* a relatively inexpensive operation since we expect these maps to
* contain a small number of nodes.
*/
/*
* Remove what's been freed in this txg from the condense_map.
* Since we're in sync_pass 1, we know that all the frees from
* this txg are in the freemap.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++)
for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
/*
* We're about to drop the metaslab's lock thus allowing
* other consumers to change its content. Set the
* space_map's sm_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
* for the ms_map as all other space_maps use per txg
* views of their content.
*/
/*
* While we would ideally like to create a space_map representation
* that consists only of allocation records, doing so can be
* prohibitively expensive because the in-core free map can be
* large, and therefore computationally expensive to subtract
* from the condense_map. Instead we sync out two maps, a cheap
* allocation only map followed by the in-core free map. While not
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
"smo size %llu", txg,
smo->smo_objsize);
}
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
{
/*
* This metaslab has just been added so there's no work to do now.
*/
return;
}
return;
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_map. No other thread can
* be modifying this txg's allocmap, freemap, freed_map, or smo.
* Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
* We drop it whenever we call into the DMU, because the DMU
* can call down to us (e.g. via zio_free()) at any time.
*/
if (smo->smo_object == 0) {
}
} else {
}
/*
* For sync pass 1, we avoid walking the entire space map and
* instead will just swap the pointers for freemap and
* freed_map. We can safely do this since the freed_map is
* guaranteed to be empty on the initial pass.
*/
} else {
}
}
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
{
/*
* If this metaslab is just becoming available, initialize its
* allocmaps, freemaps, and defermap and add its capacity to the vdev.
*/
for (int t = 0; t < TXG_SIZE; t++) {
KM_SLEEP);
KM_SLEEP);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
KM_SLEEP);
}
}
/*
* If there's a space_map_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
*/
/*
* Move the frees from the defer_map to this map (if it's loaded).
* Swap the freed_map and the defer_map -- this is safe to do
* because we've just emptied out the defer_map.
*/
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
* are back in circulation.
*/
}
/*
* If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then
* loaded a moment later, the map wouldn't reflect those allocations.)
*/
int evictable = 1;
for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
evictable = 0;
if (evictable && !metaslab_debug)
}
}
void
{
/*
* Re-evaluate all metaslabs which have lower offsets than the
* bonus area.
*/
for (int m = 0; m < vd->vdev_ms_count; m++) {
break;
}
/*
* Prefetch the next potential metaslabs
*/
}
static uint64_t
{
return (1ULL << 63);
return (0);
}
static uint64_t
{
int i;
for (i = 0; i < d; i++) {
break;
}
}
for (;;) {
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
"failures %llu, weight %llu",
return (-1ULL);
}
/*
* If the selected metaslab is condensing, skip it.
*/
continue;
break;
for (i = 0; i < d; i++)
break;
if (i == d)
break;
}
return (-1ULL);
/*
* If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we
* consider skipping it. We skip it only if we're allowed
* to "fast" gang, the physical size is larger than
* a gang block, and we're attempting to allocate from
* the primary metaslab.
*/
"vdev %llu, txg %llu, mg %p, psize %llu, "
return (-1ULL);
}
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
continue;
}
continue;
}
continue;
}
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
* to disk.
*/
continue;
}
break;
}
return (offset);
}
/*
* Allocate a block for the specified i/o.
*/
static int
{
int dshift = 3;
int all_zero;
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try and spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
/*
* It's possible the vdev we're using as the hint no
* longer exists (i.e. removed). Consult the rotor when
* all else fails.
*/
if (flags & METASLAB_HINTBP_AVOID &&
} else {
}
} else if (d != 0) {
} else {
}
/*
* If the hint put us into the wrong metaslab class, or into a
* metaslab group that has been passivated, just follow the rotor.
*/
top:
do {
/*
* Don't allocate from faulted devices.
*/
if (zio_lock) {
} else {
}
if (!allocatable)
goto next;
/*
* Avoid writing single-copy data to a failing vdev
* unless the user instructs us that it is okay.
*/
d == 0 && dshift == 3 &&
goto next;
}
distance = 0;
else
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
if (mc->mc_aliquot == 0) {
/*
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
}
mc->mc_aliquot = 0;
}
return (0);
}
next:
mc->mc_aliquot = 0;
if (!all_zero) {
dshift++;
goto top;
}
if (!allocatable && !zio_lock) {
dshift = 3;
goto top;
}
}
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
*/
static void
{
return;
ASSERT(0);
return;
}
if (DVA_GET_GANG(dva))
if (now) {
} else {
}
}
/*
* Intent log support: upon opening the pool after a crash, notify the SPA
* of blocks that the intent log has allocated for immediate write, but
* which are still considered free by the SPA because the last transaction
* group didn't commit yet.
*/
static int
{
int error = 0;
if (DVA_GET_GANG(dva))
return (error);
}
}
return (0);
}
int
{
int error = 0;
}
for (int d = 0; d < ndvas; d++) {
if (error) {
for (d--; d >= 0; d--) {
}
return (error);
}
}
return (0);
}
void
{
for (int d = 0; d < ndvas; d++)
}
int
{
int error = 0;
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
return (error);
}
for (int d = 0; d < ndvas; d++)
break;
return (error);
}
static void
{
}
void
{
if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
return;
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
for (int j = 0; j < TXG_SIZE; j++)
for (int j = 0; j < TXG_DEFER_SIZE; j++)
}
}