metaslab.c revision b1be2892dd07cf9a97d47ad06334cdc879196aaf
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
/*
* Allow allocations to switch to gang blocks quickly. We do this to
* avoid having to load lots of space_maps in a given txg. There are,
* however, some cases where we want to avoid "fast" ganging and instead
* we want to do an exhaustive search of all metaslabs on this device.
* Currently we don't allow any gang, slog, or dump device related allocations
* to "fast" gang.
*/
#define CAN_FASTGANG(flags) \
#define METASLAB_ACTIVE_MASK \
/*
* The in-core space map representation is more compact than its on-disk form.
* zfs_condense_pct determines how much more compact the in-core
* space_map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100. The default is 200.
*/
int zfs_condense_pct = 200;
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
* same number of blocks after condensing. Since the goal of condensing is to
* reduce the number of IOPs required to read the space map, we only want to
* condense when we can be sure we will reduce the number of blocks used by the
* space map. Unfortunately, we cannot precisely compute whether or not this is
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
* we apply the following heuristic: do not condense a spacemap unless the
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
* blocks.
*/
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold, the allocator will avoid allocating to that
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
* Once all groups in the pool reach zfs_mg_noalloc_threshold, all
* groups are allowed to accept allocations. Gang blocks are always
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
int zfs_mg_noalloc_threshold = 0;
/*
* Metaslab groups are considered eligible for allocations if their
* fragmentation metric (measured as a percentage) is less than or equal to
* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
* then it will be skipped unless all metaslab groups within the metaslab
* class have also crossed this threshold. The default is 85.
*/
int zfs_mg_fragmentation_threshold = 85;
/*
* Allow metaslabs to keep their active state as long as their fragmentation
* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
* active metaslab that exceeds this threshold will no longer keep its active
* status, allowing better metaslabs to be selected. The default is 70.
*/
int zfs_metaslab_fragmentation_threshold = 70;
/*
* When set, all metaslabs are loaded when the pool is first opened.
* Off (0) by default.
*/
int metaslab_debug_load = 0;
/*
* When set, metaslabs are prevented from being unloaded.
* Off (0) by default.
*/
int metaslab_debug_unload = 0;
/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using a more
* aggressive strategy (i.e. search by size rather than offset).
*/
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations. The default is 4 percent.
*/
int metaslab_df_free_pct = 4;
/*
* A metaslab is considered "free" if it contains a contiguous
* segment which is greater than metaslab_min_alloc_size.
*/
/*
* Percentage of all CPUs that can be used by the metaslab taskq.
* The default is 50.
*/
int metaslab_load_pct = 50;
/*
* Determines how many txgs a metaslab may remain loaded without having any
* allocations from it. As long as a metaslab continues to be used we will
* keep it loaded.
*/
/*
* Max number of metaslabs per group to preload.
*/
/*
*/
/*
*/
/*
*/
/*
*/
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
{
return (mc);
}
void
{
}
int
{
/*
* Must hold one of the spa_config locks.
*/
return (0);
do {
return (0);
}
void
{
}
{
}
{
return (mc->mc_deferred);
}
{
}
{
}
void
{
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
KM_SLEEP);
for (int c = 0; c < rvd->vdev_children; c++) {
/*
* Skip any holes, uninitialized top-levels, or
* vdevs that are not in this metaslab class.
*/
continue;
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
}
/*
* Calculate the metaslab class's fragmentation metric. The metric
* is weighted based on the space contribution of each metaslab group.
* The return value will be a number between 0 and 100 (inclusive), or
* ZFS_FRAG_INVALID if the metric has not been set. See comment above the
* zfs_frag_table for more information about the metric.
*/
{
uint64_t fragmentation = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
/*
* Skip any holes, uninitialized top-levels, or
* vdevs that are not in this metaslab class.
*/
continue;
}
/*
* If a metaslab group does not contain a fragmentation
* metric then just bail out.
*/
return (ZFS_FRAG_INVALID);
}
/*
* Determine how much this metaslab_group is contributing
* to the overall pool fragmentation metric.
*/
}
return (fragmentation);
}
/*
* Calculate the amount of expandable space that is available in
* this metaslab class. If a device is expanded then its expandable
* space will be the amount of allocatable space that is currently not
* part of this metaslab class.
*/
{
for (int c = 0; c < rvd->vdev_children; c++) {
continue;
}
}
return (space);
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
static int
{
return (1);
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
return (-1);
return (1);
return (0);
}
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold. If a metaslab group transitions
* from allocatable to non-allocatable or vice versa then the metaslab
* group's class is updated to reflect the transition.
*/
static void
{
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
/*
* The mc_alloc_groups maintains a count of the number of
* groups in this metaslab class that are still above the
* zfs_mg_noalloc_threshold. This is used by the allocating
* threads to determine if they should avoid allocations to
* a given group. The allocator will avoid allocations to a group
* if that group has reached or is below the zfs_mg_noalloc_threshold
* and there are still other groups that are above the threshold.
* When a group transitions from allocatable to non-allocatable or
* vice versa we update the metaslab class to reflect that change.
* When the mc_alloc_groups value drops to 0 that means that all
* groups have reached the zfs_mg_noalloc_threshold making all groups
* eligible for allocations. This effectively means that all devices
* are balanced again.
*/
mc->mc_alloc_groups--;
mc->mc_alloc_groups++;
}
{
mg->mg_activation_count = 0;
return (mg);
}
void
{
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
* because we're done, and possibly removing the vdev.
*/
}
void
{
if (++mg->mg_activation_count <= 0)
return;
} else {
}
}
void
{
if (--mg->mg_activation_count != 0) {
return;
}
} else {
}
}
{
}
void
{
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
KM_SLEEP);
for (int m = 0; m < vd->vdev_ms_count; m++) {
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
}
static void
{
return;
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
}
}
void
{
return;
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
}
}
static void
{
}
static void
{
}
static void
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 511].
*/
}
/*
* Calculate the fragmentation for a given metaslab group. We can use
* a simple average here since all metaslabs within the group must have
* the same size. The return value will be a value between 0 and 100
* (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
* this group have a fragmentation metric.
*/
{
uint64_t fragmentation = 0;
for (int m = 0; m < vd->vdev_ms_count; m++) {
continue;
valid_ms++;
}
return (ZFS_FRAG_INVALID);
return (fragmentation);
}
/*
* Determine if a given metaslab group should skip allocations. A metaslab
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations.
*/
static boolean_t
{
/*
* We use two key metrics to determine if a metaslab group is
* considered allocatable -- free space and fragmentation. If
* the free space is greater than the free space threshold and
* the fragmentation is less than the fragmentation threshold then
* consider the group allocatable. There are two cases when we will
* not consider these key metrics. The first is if the group is
* associated with a slog device and the second is if all groups
* in this metaslab class have already been considered ineligible
* for allocations.
*/
}
/*
* ==========================================================================
* Range tree callbacks
* ==========================================================================
*/
/*
* Comparison function for the private size-ordered tree. Tree is sorted
* by size, larger sizes at the end of the tree.
*/
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
static void
{
}
/*
* Destroy the block allocator specific components.
*/
static void
{
}
static void
{
}
static void
{
}
static void
{
/*
* Normally one would walk the tree freeing nodes along the way.
* Since the nodes are shared with the range trees we can avoid
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
}
static range_tree_ops_t metaslab_rt_ops = {
};
/*
* ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
/*
* Return the maximum contiguous segment within the metaslab.
*/
{
return (0ULL);
}
{
if (start != -1ULL) {
}
return (start);
}
/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
*/
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
static uint64_t
{
return (offset);
}
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
}
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static uint64_t
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
* bucket) but it does not guarantee that other allocation sizes
* will not exist in the same region.
*/
}
static metaslab_ops_t metaslab_ff_ops = {
};
/*
* ==========================================================================
* Dynamic block allocator -
* Uses the first fit allocation scheme until space gets low and then
* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
* ==========================================================================
*/
static uint64_t
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
* bucket) but it does not guarantee that other allocation sizes
* will not exist in the same region.
*/
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
t = &msp->ms_size_tree;
*cursor = 0;
}
}
static metaslab_ops_t metaslab_df_ops = {
};
/*
* ==========================================================================
* Cursor fit block allocator -
* Select the largest region in the metaslab, set the cursor to the beginning
* of the range and the cursor_end to the end of the range. As allocations
* are made advance the cursor. Continue allocating from the cursor until
* the range is exhausted and then find a new range.
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
}
return (offset);
}
static metaslab_ops_t metaslab_cf_ops = {
};
/*
* ==========================================================================
* New dynamic fit allocator -
* Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
* contiguous blocks. If no region is found then just use the largest segment
* that remains.
* ==========================================================================
*/
/*
* Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
* to request from the allocator.
*/
static uint64_t
{
return (-1ULL);
t = &msp->ms_size_tree;
}
}
return (-1ULL);
}
static metaslab_ops_t metaslab_ndf_ops = {
};
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
/*
* Wait for any in-progress metaslab loads to complete.
*/
void
{
while (msp->ms_loading) {
}
}
int
{
int error = 0;
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to the
* ms_tree.
*/
else
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
return (error);
}
void
{
}
{
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
*/
if (object != 0) {
}
/*
* We create the main range tree here, but we don't create the
* alloctree and freetree until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(msp, 0);
/*
* If metaslab_debug_load is set and we're initializing a metaslab
* that has an allocated space_map object then load its space
* map so that we can verify frees.
*/
}
if (txg != 0) {
}
return (msp);
}
void
{
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
#define FRAGMENTATION_TABLE_SIZE 17
/*
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
* multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
* equates to most of the free space being comprised of small segments.
* Conversely, if the metric is low, then most of the free space is in
* large segments. A 10% change in fragmentation equates to approximately
* double the number of segments.
*
* This table defines 0% fragmented space using 16MB segments. Testing has
* shown that segments that are greater than or equal to 16MB do not suffer
* from drastic performance problems. Using this value, we derive the rest
* of the table. Since the fragmentation value is never stored on disk, it
* is possible to change these calculations in the future.
*
* The table has FRAGMENTATION_TABLE_SIZE (17) entries, one per power-of-two
* segment size starting at 512B. The final entry (32M by that progression)
* is listed explicitly so that the number of initializers matches the
* declared size instead of relying on implicit zero-initialization.
*/
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
100, /* 512B */
100, /* 1K */
98, /* 2K */
95, /* 4K */
90, /* 8K */
80, /* 16K */
70, /* 32K */
60, /* 64K */
50, /* 128K */
40, /* 256K */
30, /* 512K */
20, /* 1M */
15, /* 2M */
10, /* 4M */
5, /* 8M */
0, /* 16M */
0 /* 32M */
};
/*
* Calculate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/
static uint64_t
{
uint64_t fragmentation = 0;
if (!feature_enabled)
return (ZFS_FRAG_INVALID);
/*
* A null space map means that the entire metaslab is free
* and thus is not fragmented.
*/
return (0);
/*
* If this metaslab's space_map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (spa_writeable(spa)) {
}
return (ZFS_FRAG_INVALID);
}
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
FRAGMENTATION_TABLE_SIZE - 1);
continue;
}
if (total > 0)
fragmentation /= total;
return (fragmentation);
}
/*
* Compute a weight -- a selection preference value -- for the given metaslab.
* This is based on the amount of free space, the level of fragmentation,
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t
{
/*
* This vdev is in the process of being removed so there is nothing
* for us to do here.
*/
if (vd->vdev_removing) {
return (0);
}
/*
* The baseline weight is the metaslab's free space.
*/
/*
* Use the fragmentation information to inversely scale
* down the baseline weight. We need to ensure that we
* don't exclude this metaslab completely when it's 100%
* fragmented. To avoid this we reduce the fragmented value
* by 1.
*/
/*
* If space < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. The fragmentation metric may have
* decreased the space to something smaller than
* SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
* so that we can consume any remaining space.
*/
}
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
}
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off. If the fragmentation on this metaslab
* has exceeded our threshold, then don't mark it active.
*/
}
return (weight);
}
static int
{
if (error) {
return (error);
}
}
}
return (0);
}
static void
{
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
}
/*
* Preload worker: takes an opaque argument and loads a metaslab via
* metaslab_load(), explicitly discarding the return value (void cast).
* NOTE(review): the statements that derive msp (presumably from arg) and
* that update ms_access_txg are not visible in this view -- confirm against
* the complete source before relying on this description.
*/
static void
metaslab_preload(void *arg)
{
/* The result of the load is intentionally ignored. */
(void) metaslab_load(msp);
/*
* Set the ms_access_txg value so that we don't unload it right away.
*/
}
static void
{
int m = 0;
return;
}
/*
* Load the next potential metaslabs
*/
/*
* We preload only the maximum number of metaslabs specified
* by metaslab_preload_limit. If a metaslab is being forced
* to condense then we preload it too. This will ensure
* that force condensing happens in the next txg.
*/
continue;
}
/*
* We must drop the metaslab group lock here to preserve
* lock ordering with the ms_lock (when grabbing both
* the mg_lock and the ms_lock, the ms_lock must be taken
* first). As a result, it is possible that the ordering
* of the metaslabs within the avl tree may change before
* we reacquire the lock. The metaslab cannot be removed from
* the tree while we're in syncing context so it is safe to
* drop the mg_lock here. If the metaslabs are reordered
* nothing will break -- we just may end up loading a
* less than optimal one.
*/
}
}
/*
* Determine if the space map's on-disk footprint is past our tolerance
* for inefficiency. We would like to use the following criteria to make
* our decision:
*
* 1. The size of the space map object should not dramatically increase as a
* result of writing out the free space range tree.
*
* 2. The minimal on-disk space map representation is zfs_condense_pct/100
* times the size of the free space range tree representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
* 3. The on-disk size of the space map should actually decrease.
*
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
* size required to write out the largest segment in our free tree. If the
* size required to represent that segment on disk is larger than the space
* map object then we avoid condensing this map.
*
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
*
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
* Instead, we apply the heuristic described in the block comment for
* zfs_metaslab_condense_block_threshold - we only condense if the space used
* is greater than a threshold number of blocks.
*/
static boolean_t
{
/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. We always condense
* metaslabs that are empty and metaslabs for which a condense
* request has been made.
*/
return (B_TRUE);
/*
* Calculate the number of 64-bit entries this segment would
* require when written to disk. If this single segment would be
* larger on-disk than the entire current on-disk structure, then
* clearly condensing will increase the on-disk structure size.
*/
return (segsz <= object_size &&
}
/*
* Condense the on-disk space map representation to its minimized form.
* The minimized form consists of a small number of allocations followed by
* the entries of the free range tree.
*/
static void
{
"smp size %llu, segments %lu, forcing condense=%s", txg,
/*
* Create a range tree that is 100% allocated. We remove segments
* that have been freed in this txg, any deferred frees that exist,
* and any allocation in the future. Removing segments should be
* a relatively inexpensive operation since we expect these trees to
* have a small number of nodes.
*/
/*
* Remove what's been freed in this txg from the condense_tree.
* Since we're in sync_pass 1, we know that all the frees from
* this txg are in the freetree.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
}
/*
* We're about to drop the metaslab's lock thus allowing
* other consumers to change its content. Set the
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
* for the ms_tree as all other range trees use per txg
* views of their content.
*/
/*
* While we would ideally like to create a space_map representation
* that consists only of allocation records, doing so can be
* prohibitively expensive because the in-core free tree can be
* large, and therefore computationally expensive to subtract
* from the condense_tree. Instead we sync out two trees, a cheap
* allocation only tree followed by the in-core free tree. While not
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
}
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
{
/*
* This metaslab has just been added so there's no work to do now.
*/
return;
}
/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense we need to let it through.
*/
if (range_tree_space(alloctree) == 0 &&
range_tree_space(*freetree) == 0 &&
return;
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_tree. No other thread can
* be modifying this txg's alloctree, freetree, freed_tree, or
* space_map_phys_t. Therefore, we only hold ms_lock to satisfy
* space_map ASSERTs. We drop it whenever we call into the DMU,
* because the DMU can call down to us (e.g. via zio_free()) at
* any time.
*/
VERIFY3U(new_object, !=, 0);
}
/*
* Note: metaslab_condense() clears the space_map's histogram.
* Therefore we must verify and remove this histogram before
* condensing.
*/
} else {
}
/*
* When the space map is loaded, we have an accurate
* histogram in the range tree. This gives us an opportunity
* to bring the space map's histogram up-to-date so we clear
* it first before updating it.
*/
} else {
/*
* Since the space map is not loaded we simply update the
* existing histogram with what was freed in this txg. This
* means that the on-disk histogram may not have an accurate
* view of the free space but it's close enough to allow
* us to make allocation decisions.
*/
}
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freetree and
* freed_tree. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
} else {
}
}
}
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
{
/*
* If this metaslab is just becoming available, initialize its
* alloctrees, freetrees, and defertree and add its capacity to
* the vdev.
*/
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
/*
* If there's a metaslab_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
*/
/*
* Move the frees from the defer_tree back to the free
* range tree (if it's loaded). Swap the freed_tree and the
* defer_tree -- this is safe to do because we've just emptied out
* the defer_tree.
*/
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
* are back in circulation.
*/
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
}
if (!metaslab_debug_unload)
}
}
void
{
/*
* Preload the next potential metaslabs
*/
}
static uint64_t
{
return (1ULL << 63);
return (0);
}
static uint64_t
{
int i;
for (i = 0; i < d; i++) {
break;
}
}
for (;;) {
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
return (-1ULL);
}
/*
* If the selected metaslab is condensing, skip it.
*/
if (msp->ms_condensing)
continue;
break;
min_distance >> 1);
for (i = 0; i < d; i++)
break;
if (i == d)
break;
}
return (-1ULL);
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
continue;
}
continue;
}
continue;
}
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
* to disk.
*/
if (msp->ms_condensing) {
continue;
}
break;
}
return (offset);
}
/*
* Allocate a block for the specified i/o.
*/
static int
{
int dshift = 3;
int all_zero;
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try and spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
/*
* It's possible the vdev we're using as the hint no
* longer exists (i.e. removed). Consult the rotor when
* all else fails.
*/
if (flags & METASLAB_HINTBP_AVOID &&
} else {
}
} else if (d != 0) {
} else {
}
/*
* If the hint put us into the wrong metaslab class, or into a
* metaslab group that has been passivated, just follow the rotor.
*/
top:
do {
/*
* Don't allocate from faulted devices.
*/
if (zio_lock) {
} else {
}
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging or have requested
* an allocation for the smallest gang block size
* then we don't want to avoid allocating to this
* metaslab group. If we're in this condition we should
* try to allocate from any device possible so that we
* don't inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (!allocatable)
goto next;
/*
* Avoid writing single-copy data to a failing vdev
* unless the user instructs us that it is okay.
*/
goto next;
}
distance = 0;
else
dva, d);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
/*
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
} else if (!metaslab_bias_enabled) {
}
mc->mc_aliquot = 0;
}
return (0);
}
next:
mc->mc_aliquot = 0;
if (!all_zero) {
dshift++;
goto top;
}
if (!allocatable && !zio_lock) {
dshift = 3;
goto top;
}
}
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
*/
static void
{
return;
ASSERT(0);
return;
}
if (DVA_GET_GANG(dva))
if (now) {
} else {
}
}
/*
* Intent log support: upon opening the pool after a crash, notify the SPA
* of blocks that the intent log has allocated for immediate write, but
* which are still considered free by the SPA because the last transaction
* group didn't commit yet.
*/
static int
{
int error = 0;
if (DVA_GET_GANG(dva))
return (error);
}
}
return (0);
}
int
{
int error = 0;
}
for (int d = 0; d < ndvas; d++) {
if (error != 0) {
for (d--; d >= 0; d--) {
}
return (error);
}
}
return (0);
}
void
{
for (int d = 0; d < ndvas; d++)
}
int
{
int error = 0;
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
return (error);
}
for (int d = 0; d < ndvas; d++)
break;
return (error);
}
void
{
if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
return;
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
for (int j = 0; j < TXG_SIZE; j++)
for (int j = 0; j < TXG_DEFER_SIZE; j++)
}
}