metaslab.c revision 09c9d376e8ccb8fbba74f33cc268964464092b62
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
/*
* Allow allocations to switch to gang blocks quickly. We do this to
* avoid having to load lots of space_maps in a given txg. There are,
* however, some cases where we want to avoid "fast" ganging and instead
* we want to do an exhaustive search of all metaslabs on this device.
* Currently we don't allow any gang or dump device related allocations
* to "fast" gang.
*/
#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))
/*
* This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping
* allocations on that device.
*/
/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;
/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using a more
* aggressive strategy (i.e., search by size rather than offset).
*/
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 4;
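/*
 * A minimal sketch of how the two tunables above might be combined to
 * decide when to abandon first-fit, assuming the sm_space/sm_size fields
 * of space_map_t. The helper name is hypothetical, and
 * metaslab_df_alloc_threshold (used later in this file) has its
 * declaration elided above.
 */
static boolean_t
example_df_should_switch(space_map_t *sm, uint64_t max_size)
{
	/* Percentage of the space map that is still free. */
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	/* Switch to best-fit once the largest free segment is too small... */
	if (max_size < metaslab_df_alloc_threshold)
		return (B_TRUE);

	/* ...or once overall free space drops below the cutoff. */
	return (free_pct < metaslab_df_free_pct ? B_TRUE : B_FALSE);
}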
/*
* A metaslab is considered "free" if it contains a contiguous
* segment which is greater than metaslab_min_alloc_size.
*/
/*
* Max number of space_maps to prefetch.
*/
/*
* Percentage bonus multiplier for metaslabs that are in the bonus area.
*/
int metaslab_smo_bonus_pct = 150;
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
{
return (mc);
}
void
{
}
int
{
/*
* Must hold one of the spa_config locks.
*/
return (0);
do {
return (0);
}
void
{
}
{
}
{
return (mc->mc_deferred);
}
{
}
{
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
static int
{
return (1);
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
return (-1);
return (1);
return (0);
}
{
mg->mg_activation_count = 0;
return (mg);
}
void
{
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
* because we're done, and possibly removing the vdev.
*/
}
void
{
if (++mg->mg_activation_count <= 0)
return;
} else {
}
}
void
{
if (--mg->mg_activation_count != 0) {
return;
}
} else {
}
}
static void
{
}
static void
{
}
static void
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 510].
*/
}
/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
*/
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
static uint64_t
{
return (offset);
}
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
}
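/*
 * A sketch of the cursor-based first-fit walk described above, assuming
 * an offset-sorted AVL tree of space_seg_t entries; the function name is
 * hypothetical. The per-(size, align) cursor lets repeated allocations
 * resume where the previous one left off instead of rescanning the map.
 */
static uint64_t
example_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	/* Start searching at the remembered cursor position. */
	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/* Nothing after the cursor; wrap around once, then give up. */
	if (*cursor == 0)
		return (-1ULL);
	*cursor = 0;
	return (example_block_picker(t, cursor, size, align));
}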
static void
{
}
static void
{
/* tear down the tree */
}
}
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
/*
* Return the maximum contiguous segment within the metaslab.
*/
{
return (0ULL);
}
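/*
 * A sketch of the idea above, assuming the size-sorted AVL tree
 * (sm_pp_root) used by the common allocator routines: the largest
 * contiguous segment is simply the tree's last node. The function
 * name is hypothetical.
 */
static uint64_t
example_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}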
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static uint64_t
{
}
/* ARGSUSED */
{
return (B_TRUE);
}
static space_map_ops_t metaslab_ff_ops = {
};
/*
* ==========================================================================
* Dynamic block allocator -
* Uses the first-fit allocation scheme until space gets low and then
* adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
/*
* If we're running low on space, switch to using the size-sorted
* AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
t = sm->sm_pp_root;
*cursor = 0;
}
}
static boolean_t
{
if (max_size >= metaslab_df_alloc_threshold &&
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_df_ops = {
};
/*
* ==========================================================================
* Other experimental allocators
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
/*
* If we're running low on space, switch to using the size-sorted
* AVL tree (best-fit).
*/
t = sm->sm_pp_root;
*cursor = *extent_end = 0;
if (offset != -1)
} else {
}
return (offset);
}
static boolean_t
{
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_cdf_ops = {
};
static uint64_t
{
return (-1ULL);
t = sm->sm_pp_root;
}
}
}
return (-1ULL);
}
static boolean_t
{
return (B_FALSE);
return (B_TRUE);
}
static space_map_ops_t metaslab_ndf_ops = {
};
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
{
/*
* We create the main space map here, but we don't create the
* allocmaps and freemaps until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
}
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(msp, 0);
if (txg != 0) {
}
return (msp);
}
void
{
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++)
}
#define METASLAB_ACTIVE_MASK \
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
static uint64_t
{
/*
* The baseline weight is the metaslab's free space.
*/
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
/*
* For locality, assign higher weight to metaslabs which have
* a lower offset than what we've already activated.
*/
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off.
*/
}
return (weight);
}
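/*
 * A sketch of the weighting policy described in the comments above; the
 * exact arithmetic in the original function may differ. The helper name
 * is hypothetical; the structure fields are those used elsewhere in this
 * file.
 */
static uint64_t
example_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	/* Baseline: the metaslab's free space. */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Lower-offset (outer) metaslabs get up to a 2x bandwidth bonus,
	 * tapering linearly to 1x at the far end of the device.
	 */
	weight = 2 * weight - (sm->sm_start * weight) / vd->vdev_asize;

	/* Locality bonus for metaslabs at or below the bonus area. */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight = weight * metaslab_smo_bonus_pct / 100;

	/* Keep actively used metaslabs ahead of any inactive one. */
	if (msp->ms_weight & METASLAB_ACTIVE_MASK)
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (weight);
}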
static void
{
int m;
/*
* Prefetch the next potential metaslabs
*/
/* If we have reached our prefetch limit then we're done */
if (m >= metaslab_prefetch_limit)
break;
}
}
}
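/*
 * A sketch of a prefetch walk bounded by metaslab_prefetch_limit (whose
 * declaration is elided near the top of this file), assuming the
 * weight-sorted mg_metaslab_tree. The actual read-ahead of the space map
 * object is left as a placeholder; the helper name is hypothetical.
 */
static void
example_prefetch(metaslab_group_t *mg)
{
	avl_tree_t *t = &mg->mg_metaslab_tree;
	metaslab_t *msp;
	int m = 0;

	mutex_enter(&mg->mg_lock);

	/* Walk metaslabs in weight order, best candidates first. */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
		/* If we have reached our prefetch limit then we're done. */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!msp->ms_map.sm_loaded && msp->ms_smo.smo_object != 0) {
			/* issue an async read-ahead of the space map here */
		}
		m++;
	}

	mutex_exit(&mg->mg_lock);
}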
static int
{
if (error) {
return (error);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++)
}
/*
* Track the bonus area as we activate new metaslabs.
*/
}
}
return (0);
}
static void
{
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
}
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
{
return;
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_map. No other thread can
* be modifying this txg's allocmap, freemap, freed_map, or smo.
* Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
* We drop it whenever we call into the DMU, because the DMU
* can call down to us (e.g. via zio_free()) at any time.
*/
if (smo->smo_object == 0) {
}
/*
* The in-core space map representation is twice as compact
* as the on-disk one, so it's time to condense the latter
* by generating a pure allocmap from first principles.
*
* This metaslab is 100% allocated,
* minus the content of the in-core map (sm),
* minus what's been freed this txg (freed_map),
* minus deferred frees (ms_defermap[]),
* minus allocations from txgs in the future
* (because they haven't been committed yet).
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++)
for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
}
}
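/*
 * A sketch of the "condense from first principles" computation described
 * above, using the space_map_add/walk/remove primitives. allocmap and
 * freed_map follow the names in the comment; the surrounding setup and
 * locking are omitted, and the helper name is hypothetical.
 */
static void
example_condense(metaslab_t *msp, space_map_t *allocmap,
    space_map_t *freed_map, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;

	/* Start from "everything allocated"... */
	space_map_add(allocmap, sm->sm_start, sm->sm_size);

	/* ...subtract whatever the in-core map says is free... */
	space_map_walk(sm, space_map_remove, allocmap);

	/* ...subtract this txg's frees... */
	space_map_walk(freed_map, space_map_remove, allocmap);

	/* ...subtract deferred frees... */
	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_walk(&msp->ms_defermap[t],
		    space_map_remove, allocmap);

	/* ...and subtract allocations from future, uncommitted txgs. */
	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
		space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
		    space_map_remove, allocmap);
}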
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
{
/*
* If this metaslab is just becoming available, initialize its
* allocmaps and freemaps and add its capacity to the vdev.
*/
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++)
}
/*
* If there's a space_map_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
* Then, add defer_map (oldest deferred frees) to this map and
* transfer freed_map (this txg's frees) to defer_map.
*/
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
* are back in circulation.
*/
}
/*
* If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then
* loaded a moment later, the map wouldn't reflect those allocations.)
*/
int evictable = 1;
for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
evictable = 0;
if (evictable && !metaslab_debug)
}
}
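/*
 * A sketch of the defer-map rotation described above, assuming the
 * space_map_load_wait()/space_map_vacate() primitives. defer_map would
 * be the oldest slot of ms_defermap[] and freed_map this txg's freemap;
 * the helper name is hypothetical.
 */
static void
example_rotate_defer(space_map_t *sm, space_map_t *defer_map,
    space_map_t *freed_map)
{
	/* Wait for any in-flight load so the in-core view is consistent. */
	space_map_load_wait(sm);

	/*
	 * Return the oldest deferred frees to the in-core map (or simply
	 * drop them if the map isn't loaded), then park this txg's frees
	 * in the defer map for TXG_DEFER_SIZE txgs.
	 */
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);
}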
void
{
/*
* Re-evaluate all metaslabs which have lower offsets than the
* bonus area.
*/
for (int m = 0; m < vd->vdev_ms_count; m++) {
break;
}
/*
* Prefetch the next potential metaslabs
*/
}
static uint64_t
{
return (1ULL << 63);
return (0);
}
static uint64_t
{
int i;
for (i = 0; i < d; i++) {
break;
}
}
for (;;) {
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
"failures %llu, weight %llu",
return (-1ULL);
}
break;
for (i = 0; i < d; i++)
break;
if (i == d)
break;
}
return (-1ULL);
/*
* If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we
* consider skipping it. We skip it only if we're allowed
* to "fast" gang, the physical size is larger than
* a gang block, and we're attempting to allocate from
* the primary metaslab.
*/
"vdev %llu, txg %llu, mg %p, psize %llu, "
return (-1ULL);
}
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
continue;
}
continue;
}
continue;
}
break;
}
return (offset);
}
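/*
 * A sketch of the "consider skipping" test described in the comment
 * inside the function above: skip a group only after it has exceeded the
 * per-vdev failure threshold (zfs_mg_alloc_failures, declaration elided
 * near the top of this file), and only for allocations that are allowed
 * to fast gang, are larger than a gang block, and target the primary
 * metaslab. The helper name is hypothetical.
 */
static boolean_t
example_skip_group(metaslab_group_t *mg, int flags, uint64_t psize,
    uint64_t activation_weight)
{
	return (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
	    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
	    activation_weight == METASLAB_WEIGHT_PRIMARY ?
	    B_TRUE : B_FALSE);
}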
/*
* Allocate a block for the specified i/o.
*/
static int
{
int dshift = 3;
int all_zero;
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try and spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
/*
* It's possible the vdev we're using as the hint no
* longer exists (i.e. removed). Consult the rotor when
* all else fails.
*/
if (flags & METASLAB_HINTBP_AVOID &&
} else {
}
} else if (d != 0) {
} else {
}
/*
* If the hint put us into the wrong metaslab class, or into a
* metaslab group that has been passivated, just follow the rotor.
*/
top:
do {
/*
* Don't allocate from faulted devices.
*/
if (zio_lock) {
} else {
}
if (!allocatable)
goto next;
/*
* Avoid writing single-copy data to a failing vdev
*/
d == 0 && dshift == 3) {
goto next;
}
distance = 0;
else
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
if (mc->mc_aliquot == 0) {
/*
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
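/*
 * A sketch of that computation (local names vs/vu/cu are assumed for
 * this illustration): compare the device's utilization with the
 * pool-wide utilization and scale the group's aliquot by the
 * difference. The "+ 1" guards against dividing by zero.
 */
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
int64_t cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100;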
}
mc->mc_aliquot = 0;
}
return (0);
}
next:
mc->mc_aliquot = 0;
if (!all_zero) {
dshift++;
goto top;
}
if (!allocatable && !zio_lock) {
dshift = 3;
goto top;
}
return (ENOSPC);
}
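/*
 * A sketch of the rotor/aliquot rotation described in the large comment
 * near the top of the function above: each successful allocation is
 * charged against the current group's share, and once that share (plus
 * any bias) is consumed the rotor advances to the next group. The helper
 * name is hypothetical.
 */
static void
example_rotor_advance(metaslab_class_t *mc, metaslab_group_t *mg,
    uint64_t asize)
{
	if ((mc->mc_aliquot += asize) >= mg->mg_aliquot + mg->mg_bias) {
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	}
}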
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
*/
static void
{
return;
ASSERT(0);
return;
}
if (DVA_GET_GANG(dva))
if (now) {
} else {
}
}
/*
* Intent log support: upon opening the pool after a crash, notify the SPA
* of blocks that the intent log has allocated for immediate write, but
* which are still considered free by the SPA because the last transaction
* group didn't commit yet.
*/
static int
{
int error = 0;
return (ENXIO);
if (DVA_GET_GANG(dva))
return (error);
}
}
return (0);
}
int
{
int error = 0;
return (ENOSPC);
}
for (int d = 0; d < ndvas; d++) {
if (error) {
for (d--; d >= 0; d--) {
}
return (error);
}
}
return (0);
}
void
{
for (int d = 0; d < ndvas; d++)
}
int
{
int error = 0;
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
return (error);
}
for (int d = 0; d < ndvas; d++)
break;
return (error);
}