/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
* same number of blocks after condensing. Since the goal of condensing is to
* reduce the number of IOPs required to read the space map, we only want to
* condense when we can be sure we will reduce the number of blocks used by the
* space map. Unfortunately, we cannot precisely compute whether or not this is
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a space map unless the
 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
* blocks.
*/
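/*
 * Editor-added illustrative sketch (not part of the original source): the
 * block-count heuristic described above, in isolation. The helper name and
 * its parameters (current on-disk size of the space map object, the space
 * map block size, and the threshold) are assumptions for illustration only.
 */
static boolean_t
example_past_block_threshold(uint64_t object_size, uint64_t sm_blksz,
    uint64_t condense_block_threshold)
{
	/* Number of blocks the uncondensed space map currently consumes. */
	uint64_t dblocks = (object_size + sm_blksz - 1) / sm_blksz;

	return (dblocks > condense_block_threshold);
}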
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
* groups are allowed to accept allocations. Gang blocks are always
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
int zfs_mg_noalloc_threshold = 0;
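/*
 * Editor-added illustrative sketch (not part of the original source): the
 * eligibility rule described above, stated by itself. "free_pct" is the
 * group's free space as a percentage and "groups_above" is the number of
 * groups in the class still above the threshold; both are assumed inputs.
 */
static boolean_t
example_mg_eligible(int free_pct, int groups_above)
{
	/* Plenty of free space: always eligible. */
	if (free_pct > zfs_mg_noalloc_threshold)
		return (B_TRUE);

	/*
	 * At or below the threshold: only eligible once every group in
	 * the pool has also reached the threshold.
	 */
	return (groups_above == 0);
}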
/*
* Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
* then it will be skipped unless all metaslab groups within the metaslab
* class have also crossed this threshold.
*/
/*
* Allow metaslabs to keep their active state as long as their fragmentation
* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
* active metaslab that exceeds this threshold will no longer keep its active
* status allowing better metaslabs to be selected.
*/
/*
* When set will load all metaslabs when pool is first opened.
*/
int metaslab_debug_load = 0;
/*
* When set will prevent metaslabs from being unloaded.
*/
int metaslab_debug_unload = 0;
/*
* Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
*/
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
/*
* A metaslab is considered "free" if it contains a contiguous
* segment which is greater than metaslab_min_alloc_size.
*/
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
/*
* Determines how many txgs a metaslab may remain loaded without having any
* allocations from it. As long as a metaslab continues to be used we will
* keep it loaded.
*/
/*
* Max number of metaslabs per group to preload.
*/
/*
* When using segment-based metaslab selection, we will continue
* allocating from the active metaslab until we have exhausted
* zfs_metaslab_switch_threshold of its buckets.
*/
/*
 * Enable/disable the metaslab allocation tracing facility.
 */
/*
* Maximum entries that the metaslab allocation tracing facility will keep
* in a given list when running in non-debug mode. We limit the number
* of entries in non-debug mode to prevent us from using up too much memory.
* The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
*/
static void metaslab_set_fragmentation(metaslab_t *);
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
{
return (mc);
}
void
{
}
int
{
/*
* Must hold one of the spa_config locks.
*/
return (0);
do {
return (0);
}
void
{
}
{
}
{
return (mc->mc_deferred);
}
{
}
{
}
void
{
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
KM_SLEEP);
for (int c = 0; c < rvd->vdev_children; c++) {
/*
* Skip any holes, uninitialized top-levels, or
 * vdevs that are not in this metaslab class.
*/
continue;
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
}
/*
* Calculate the metaslab class's fragmentation metric. The metric
* is weighted based on the space contribution of each metaslab group.
* The return value will be a number between 0 and 100 (inclusive), or
* ZFS_FRAG_INVALID if the metric has not been set. See comment above the
* zfs_frag_table for more information about the metric.
*/
{
for (int c = 0; c < rvd->vdev_children; c++) {
/*
* Skip any holes, uninitialized top-levels, or
 * vdevs that are not in this metaslab class.
*/
continue;
}
/*
* If a metaslab group does not contain a fragmentation
* metric then just bail out.
*/
return (ZFS_FRAG_INVALID);
}
/*
* Determine how much this metaslab_group is contributing
* to the overall pool fragmentation metric.
*/
}
return (fragmentation);
}
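/*
 * Editor-added illustrative sketch (not part of the original source): the
 * space-weighted average described above, computed over per-group
 * fragmentation values and their space contributions. Parameter names are
 * assumptions; the real code walks the top-level vdevs instead.
 */
static uint64_t
example_class_fragmentation(const uint64_t *frag, const uint64_t *space,
    int ngroups)
{
	uint64_t total = 0, weighted = 0;

	for (int g = 0; g < ngroups; g++) {
		/* Bail out if any group lacks a fragmentation metric. */
		if (frag[g] == ZFS_FRAG_INVALID)
			return (ZFS_FRAG_INVALID);
		total += space[g];
		weighted += frag[g] * space[g];
	}
	return (total == 0 ? 0 : weighted / total);
}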
/*
* Calculate the amount of expandable space that is available in
* this metaslab class. If a device is expanded then its expandable
* space will be the amount of allocatable space that is currently not
* part of this metaslab class.
*/
{
for (int c = 0; c < rvd->vdev_children; c++) {
continue;
}
/*
* Calculate if we have enough space to add additional
* metaslabs. We report the expandable space in terms
* of the metaslab size since that's the unit of expansion.
*/
}
return (space);
}
static int
{
return (1);
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
return (-1);
return (1);
return (0);
}
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
void
{
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
* from syncing context with a loaded metaslab that has an allocated
* space map. Calling this in non-syncing context does not
* provide a consistent view of the metaslab since we're performing
* allocations in the future.
*/
return;
/*
* Account for future allocations since we would have already
* deducted that space from the ms_freetree.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
}
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold or has a fragmentation value that is
* greater than zfs_mg_fragmentation_threshold. If a metaslab group
* transitions from allocatable to non-allocatable or vice versa then the
* metaslab group's class is updated to reflect the transition.
*/
static void
{
/*
* If the metaslab group was just added then it won't
* have any space until we finish syncing out this txg.
* At that point we will consider it initialized and available
* for allocations. We also don't consider non-activated
* metaslab groups (e.g. vdevs that are in the middle of being removed)
* to be initialized, because they can't be used for allocation.
*/
}
if (mg->mg_initialized)
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
/*
* The mc_alloc_groups maintains a count of the number of
* groups in this metaslab class that are still above the
* zfs_mg_noalloc_threshold. This is used by the allocating
* threads to determine if they should avoid allocations to
* a given group. The allocator will avoid allocations to a group
* if that group has reached or is below the zfs_mg_noalloc_threshold
* and there are still other groups that are above the threshold.
* When a group transitions from allocatable to non-allocatable or
* vice versa we update the metaslab class to reflect that change.
* When the mc_alloc_groups value drops to 0 that means that all
* groups have reached the zfs_mg_noalloc_threshold making all groups
* eligible for allocations. This effectively means that all devices
* are balanced again.
*/
mc->mc_alloc_groups--;
mc->mc_alloc_groups++;
}
{
mg->mg_activation_count = 0;
return (mg);
}
void
{
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
* because we're done, and possibly removing the vdev.
*/
}
void
{
if (++mg->mg_activation_count <= 0)
return;
} else {
}
}
void
{
if (--mg->mg_activation_count != 0) {
return;
}
} else {
}
}
{
}
{
}
void
{
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
KM_SLEEP);
for (int m = 0; m < vd->vdev_ms_count; m++) {
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
}
	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
}
static void
{
return;
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
}
}
void
{
return;
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
}
}
static void
{
}
static void
{
}
static void
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 511].
*/
}
/*
* Calculate the fragmentation for a given metaslab group. We can use
* a simple average here since all metaslabs within the group must have
* the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
* group have a fragmentation metric.
*/
{
for (int m = 0; m < vd->vdev_ms_count; m++) {
continue;
valid_ms++;
}
return (ZFS_FRAG_INVALID);
return (fragmentation);
}
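/*
 * Editor-added illustrative sketch (not part of the original source): the
 * simple average described above, taken over the metaslabs that have a
 * valid fragmentation metric. The array and count parameters are assumed
 * stand-ins for the group's metaslabs.
 */
static uint64_t
example_group_fragmentation(const uint64_t *ms_frag, int ms_count)
{
	uint64_t fragmentation = 0;
	int valid_ms = 0;

	for (int m = 0; m < ms_count; m++) {
		if (ms_frag[m] == ZFS_FRAG_INVALID)
			continue;
		valid_ms++;
		fragmentation += ms_frag[m];
	}

	/* Require at least half of the metaslabs to carry a metric. */
	if (valid_ms <= ms_count / 2)
		return (ZFS_FRAG_INVALID);
	return (fragmentation / valid_ms);
}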
/*
* Determine if a given metaslab group should skip allocations. A metaslab
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations. If the allocation throttle is enabled
* then we skip allocations to devices that have reached their maximum
* allocation queue depth unless the selected metaslab group is the only
* eligible group remaining.
*/
static boolean_t
{
/*
* We can only consider skipping this metaslab group if it's
* in the normal metaslab class and there are other metaslab
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
return (B_TRUE);
/*
* If the metaslab group's mg_allocatable flag is set (see comments
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
* check if we have reached our allocation limit (mg_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
* gang block size then we allow allocations on this metaslab group
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
/*
* If this metaslab group does not have any free space, then
* there is no point in looking further.
*/
if (mg->mg_no_free_space)
return (B_FALSE);
/*
* If this metaslab group is below its qmax or it's
 * the only allocatable metaslab group, then attempt
* to allocate from it.
*/
return (B_TRUE);
/*
* Since this metaslab group is at or over its qmax, we
* need to determine if there are metaslab groups after this
* one that might be able to handle this allocation. This is
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
/*
* If there is another metaslab group that
* might be able to handle the allocation, then
* we return false so that we skip this group.
*/
return (B_FALSE);
}
/*
* We didn't find another group to handle the allocation
* so we can't skip this metaslab group even though
* we are at or over our qmax.
*/
return (B_TRUE);
return (B_TRUE);
}
return (B_FALSE);
}
/*
* ==========================================================================
* Range tree callbacks
* ==========================================================================
*/
/*
* Comparison function for the private size-ordered tree. Tree is sorted
* by size, larger sizes at the end of the tree.
*/
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
static void
{
}
/*
* Destroy the block allocator specific components.
*/
static void
{
}
static void
{
}
static void
{
}
static void
{
/*
* Normally one would walk the tree freeing nodes along the way.
* Since the nodes are shared with the range trees we can avoid
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
}
};
/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
*/
/*
* Return the maximum contiguous segment within the metaslab.
*/
{
return (0ULL);
}
static range_seg_t *
{
}
return (rs);
}
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
static uint64_t
{
return (offset);
}
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
}
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static uint64_t
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
 * bucket), but it does not guarantee that other allocation sizes
 * will not exist in the same region.
*/
}
};
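/*
 * Editor-added illustrative sketch (not part of the original source): the
 * largest power of 2 that evenly divides a size is its lowest set bit,
 * which selects the cursor bucket described above.
 */
static uint64_t
example_cursor_align(uint64_t size)
{
	/* e.g. size = 24K (0x6000) yields align = 8K (0x2000). */
	return (size & -size);
}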
/*
* ==========================================================================
* Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
* ==========================================================================
*/
static uint64_t
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
 * bucket), but it does not guarantee that other allocation sizes
 * will not exist in the same region.
*/
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
t = &msp->ms_size_tree;
*cursor = 0;
}
}
};
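/*
 * Editor-added illustrative sketch (not part of the original source): the
 * first-fit/best-fit switch described above. The threshold parameters are
 * assumed stand-ins for metaslab_df_alloc_threshold and
 * metaslab_df_free_pct.
 */
static boolean_t
example_df_use_best_fit(uint64_t max_size, uint64_t free_space,
    uint64_t ms_size, uint64_t alloc_threshold, int free_pct_threshold)
{
	int free_pct = (int)((free_space * 100) / ms_size);

	/*
	 * Switch to the size-sorted (best-fit) search once the largest
	 * free segment or the free percentage drops below its threshold.
	 */
	return (max_size < alloc_threshold || free_pct < free_pct_threshold);
}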
/*
* ==========================================================================
* Cursor fit block allocator -
* Select the largest region in the metaslab, set the cursor to the beginning
* of the range and the cursor_end to the end of the range. As allocations
* are made advance the cursor. Continue allocating from the cursor until
* the range is exhausted and then find a new range.
* ==========================================================================
*/
static uint64_t
{
return (-1ULL);
}
return (offset);
}
};
/*
* ==========================================================================
* New dynamic fit allocator -
* Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
* contiguous blocks. If no region is found then just use the largest segment
* that remains.
* ==========================================================================
*/
/*
* Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
* to request from the allocator.
*/
static uint64_t
{
return (-1ULL);
t = &msp->ms_size_tree;
}
}
return (-1ULL);
}
};
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
/*
* Wait for any in-progress metaslab loads to complete.
*/
void
{
while (msp->ms_loading) {
}
}
int
{
int error = 0;
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to the
* ms_tree.
*/
else
if (success) {
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
return (error);
}
void
{
msp->ms_max_size = 0;
}
int
metaslab_t **msp)
{
int error;
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
*/
if (object != 0) {
if (error != 0) {
return (error);
}
}
/*
* We create the main range tree here, but we don't create the
* alloctree and freetree until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
* The metaslab's weight will also be initialized when we sync
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(ms, 0);
/*
* If metaslab_debug_load is set and we're initializing a metaslab
 * that has an allocated space map object then load its space
 * map so that we can verify frees.
*/
}
if (txg != 0) {
}
return (0);
}
void
{
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
/*
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the space map histogram and
 * multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
* equates to most of the free space being comprised of small segments.
* Conversely, if the metric is low, then most of the free space is in
* large segments. A 10% change in fragmentation equates to approximately
* double the number of segments.
*
* This table defines 0% fragmented space using 16MB segments. Testing has
* shown that segments that are greater than or equal to 16MB do not suffer
* from drastic performance problems. Using this value, we derive the rest
* of the table. Since the fragmentation value is never stored on disk, it
* is possible to change these calculations in the future.
*/
100, /* 512B */
100, /* 1K */
98, /* 2K */
95, /* 4K */
90, /* 8K */
80, /* 16K */
70, /* 32K */
60, /* 64K */
50, /* 128K */
40, /* 256K */
30, /* 512K */
20, /* 1M */
15, /* 2M */
10, /* 4M */
5, /* 8M */
0 /* 16M */
};
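/*
 * Editor-added illustrative sketch (not part of the original source): how a
 * fragmentation value can be derived from a space map histogram and the
 * table above, as described in the preceding comment. "hist", "sm_shift",
 * "frag_table", and "table_size" are assumed stand-ins for the metaslab's
 * histogram, space map shift, and the table.
 */
static uint64_t
example_fragmentation(const uint64_t *hist, int hist_size, int sm_shift,
    const uint64_t *frag_table, int table_size)
{
	uint64_t frag = 0, total = 0;

	for (int i = 0; i < hist_size; i++) {
		if (hist[i] == 0)
			continue;

		/* Space in this bucket, weighted by the table entry. */
		uint64_t space = hist[i] << (i + sm_shift);
		int idx = MIN(i, table_size - 1);

		total += space;
		frag += space * frag_table[idx];
	}

	if (total > 0)
		frag /= total;
	return (frag);
}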
/*
 * Calculate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/
static void
{
if (!feature_enabled) {
return;
}
/*
* A null space map means that the entire metaslab is free
* and thus is not fragmented.
*/
msp->ms_fragmentation = 0;
return;
}
/*
* If this metaslab's space map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (spa_writeable(spa)) {
}
return;
}
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
FRAGMENTATION_TABLE_SIZE - 1);
continue;
}
if (total > 0)
fragmentation /= total;
}
/*
* Compute a weight -- a selection preference value -- for the given metaslab.
* This is based on the amount of free space, the level of fragmentation,
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t
{
/*
* The baseline weight is the metaslab's free space.
*/
/*
* Use the fragmentation information to inversely scale
* down the baseline weight. We need to ensure that we
* don't exclude this metaslab completely when it's 100%
* fragmented. To avoid this we reduce the fragmented value
* by 1.
*/
/*
* If space < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. The fragmentation metric may have
* decreased the space to something smaller than
* SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
* so that we can consume any remaining space.
*/
}
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
}
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off. If the fragmentation on this metaslab
 * has exceeded our threshold, then don't mark it active.
*/
}
return (weight);
}
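/*
 * Editor-added illustrative sketch (not part of the original source): the
 * space-based weight described above, reduced to its inputs. The guard on
 * frag > 0 and the parameter names are assumptions of this sketch.
 */
static uint64_t
example_space_weight(uint64_t free_space, uint64_t frag, uint64_t ms_id,
    uint64_t ms_count)
{
	uint64_t space = free_space;

	/* Inversely scale by fragmentation; keep 100%-fragmented usable. */
	if (frag != ZFS_FRAG_INVALID && frag > 0) {
		space = (space * (100 - (frag - 1))) / 100;
		if (space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}

	/* Favor lower (outer, faster) metaslabs: multiplier 2x down to 1x. */
	return (2 * space - (ms_id * space) / ms_count);
}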
/*
* Return the weight of the specified metaslab, according to the segment-based
* weighting algorithm. The metaslab must be loaded. This function can
* be called within a sync pass since it relies only on the metaslab's
* range tree which is always accurate when the metaslab is loaded.
*/
static uint64_t
{
i--) {
segments <<= 1;
/*
* The range tree provides more precision than the space map
* and must be downgraded so that all values fit within the
* space map's histogram. This allows us to compare loaded
* vs. unloaded metaslabs to determine which metaslab is
* considered "best".
*/
if (i > max_idx)
continue;
if (segments != 0) {
WEIGHT_SET_INDEX(weight, i);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
}
return (weight);
}
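/*
 * Editor-added illustrative sketch (not part of the original source):
 * encoding the highest occupied histogram bucket into a segment-based
 * weight. WEIGHT_SET_* are the encoding macros used elsewhere in this
 * file; "hist" and "shift" are assumed stand-ins for the histogram and
 * its base bucket shift.
 */
static uint64_t
example_segment_weight(const uint64_t *hist, int hist_size, int shift)
{
	uint64_t weight = 0;

	for (int i = hist_size - 1; i >= 0; i--) {
		if (hist[i] == 0)
			continue;

		/* Encode the bucket index and segment count, inactive. */
		WEIGHT_SET_COUNT(weight, hist[i]);
		WEIGHT_SET_INDEX(weight, i + shift);
		WEIGHT_SET_ACTIVE(weight, 0);
		break;
	}
	return (weight);
}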
/*
* Calculate the weight based on the on-disk histogram. This should only
* be called after a sync pass has completely finished since the on-disk
* information is updated in metaslab_sync().
*/
static uint64_t
{
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
WEIGHT_SET_INDEX(weight, i +
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
}
return (weight);
}
/*
* Compute a segment-based weight for the specified metaslab. The weight
* is determined by highest bucket in the histogram. The information
* for the highest bucket is encoded into the weight value.
*/
static uint64_t
{
/*
* The metaslab is completely free.
*/
} else {
}
WEIGHT_SET_ACTIVE(weight, 0);
return (weight);
}
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
* determine the weight. Otherwise, we rely on the space map information
* to generate the weight.
*/
} else {
}
/*
* If the metaslab was active the last time we calculated its weight
* then keep it active. We want to consume the entire region that
* is associated with this weight.
*/
return (weight);
}
/*
* Determine if we should attempt to allocate from this metaslab. If the
* metaslab has a maximum size then we can quickly determine if the desired
* allocation size can be satisfied. Otherwise, if we're using segment-based
* weighting then we can determine the maximum allocation that this metaslab
* can accommodate based on the index encoded in the weight. If we're using
* space-based weights then rely on the entire weight (excluding the weight
* type bit).
*/
{
if (msp->ms_max_size != 0)
/*
* The metaslab segment weight indicates segments in the
* range [2^i, 2^(i+1)), where i is the index in the weight.
* Since the asize might be in the middle of the range, we
* should attempt the allocation if asize < 2^(i+1).
*/
should_allocate = (asize <
} else {
should_allocate = (asize <=
}
return (should_allocate);
}
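/*
 * Editor-added illustrative sketch (not part of the original source): the
 * decision described above, with the metaslab state passed in explicitly.
 * WEIGHT_GET_INDEX and METASLAB_WEIGHT_TYPE are the weight-encoding
 * helpers used elsewhere in this file; the parameter names are assumptions.
 */
static boolean_t
example_should_allocate(uint64_t asize, uint64_t max_size, uint64_t weight,
    boolean_t segment_based)
{
	if (max_size != 0)
		return (asize <= max_size);

	if (segment_based) {
		/* Segments lie in [2^i, 2^(i+1)); i is the encoded index. */
		return (asize < (1ULL << (WEIGHT_GET_INDEX(weight) + 1)));
	}

	/* Space-based: the weight (minus the type bit) is the free space. */
	return (asize <= (weight & ~METASLAB_WEIGHT_TYPE));
}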
static uint64_t
{
/*
* This vdev is in the process of being removed so there is nothing
* for us to do here.
*/
if (vd->vdev_removing) {
return (0);
}
/*
* Update the maximum size if the metaslab is loaded. This will
* ensure that we get an accurate maximum size if newly freed space
* has been added back into the free tree.
*/
/*
* Segment-based weighting requires space map histogram support.
*/
sizeof (space_map_phys_t))) {
} else {
}
return (weight);
}
static int
{
if (error) {
return (error);
}
}
}
return (0);
}
static void
{
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
msp->ms_activation_weight = 0;
}
/*
* Segment-based metaslabs are activated once and remain active until
* we either fail an allocation attempt (similar to space-based metaslabs)
* or have exhausted the free space in zfs_metaslab_switch_threshold
* buckets since the metaslab was activated. This function checks to see
 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 * metaslab and passivates it proactively. This will allow us to select a
 * metaslab with a larger contiguous region, if any remains, within this
 * metaslab group. If we're in sync pass > 1, then we continue using this
 * metaslab so that we don't dirty more blocks and cause more sync passes.
*/
void
{
return;
/*
* Since we are in the middle of a sync pass, the most accurate
* information that is accessible to us is the in-core range tree
* histogram; calculate the new weight based on that information.
*/
}
static void
{
(void) metaslab_load(msp);
}
static void
{
int m = 0;
return;
}
/*
* Load the next potential metaslabs
*/
/*
* We preload only the maximum number of metaslabs specified
* by metaslab_preload_limit. If a metaslab is being forced
* to condense then we preload it too. This will ensure
* that force condensing happens in the next txg.
*/
continue;
}
}
}
/*
* Determine if the space map's on-disk footprint is past our tolerance
* for inefficiency. We would like to use the following criteria to make
* our decision:
*
* 1. The size of the space map object should not dramatically increase as a
* result of writing out the free space range tree.
*
* 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the free space range tree representation
 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
* 3. The on-disk size of the space map should actually decrease.
*
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
* size required to write out the largest segment in our free tree. If the
* size required to represent that segment on disk is larger than the space
* map object then we avoid condensing this map.
*
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
*
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
* Instead, we apply the heuristic described in the block comment for
* zfs_metaslab_condense_block_threshold - we only condense if the space used
* is greater than a threshold number of blocks.
*/
static boolean_t
{
/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. We always condense
* metaslabs that are empty and metaslabs for which a condense
* request has been made.
*/
return (B_TRUE);
/*
* Calculate the number of 64-bit entries this segment would
* require when written to disk. If this single segment would be
* larger on-disk than the entire current on-disk structure, then
* clearly condensing will increase the on-disk structure size.
*/
return (segsz <= object_size &&
}
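/*
 * Editor-added illustrative sketch (not part of the original source): the
 * three condensing criteria from the comment above, folded into one
 * predicate. All parameters are assumed stand-ins for values the real
 * code derives from the space map and its size-ordered range tree.
 */
static boolean_t
example_should_condense(uint64_t largest_seg_entries, uint64_t num_segments,
    uint64_t object_size, uint64_t record_size, uint64_t condense_pct,
    uint64_t block_threshold)
{
	/* On-disk bytes needed just for the single largest free segment. */
	uint64_t segsz = largest_seg_entries * sizeof (uint64_t);

	/* Best case: every free segment fits in one 64-bit entry. */
	uint64_t minimal_size = num_segments * sizeof (uint64_t);

	return (segsz <= object_size &&
	    object_size >= (minimal_size * condense_pct / 100) &&
	    object_size > block_threshold * record_size);
}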
/*
* Condense the on-disk space map representation to its minimized form.
* The minimized form consists of a small number of allocations followed by
* the entries of the free range tree.
*/
static void
{
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
/*
 * Create a range tree that is 100% allocated. We remove segments
* that have been freed in this txg, any deferred frees that exist,
* and any allocation in the future. Removing segments should be
* a relatively inexpensive operation since we expect these trees to
* have a small number of nodes.
*/
/*
* Remove what's been freed in this txg from the condense_tree.
* Since we're in sync_pass 1, we know that all the frees from
* this txg are in the freetree.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
}
/*
* We're about to drop the metaslab's lock thus allowing
 * other consumers to change its content. Set the
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
* for the ms_tree as all other range trees use per txg
* views of their content.
*/
/*
* While we would ideally like to create a space map representation
* that consists only of allocation records, doing so can be
* prohibitively expensive because the in-core free tree can be
* large, and therefore computationally expensive to subtract
* from the condense_tree. Instead we sync out two trees, a cheap
* allocation only tree followed by the in-core free tree. While not
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
}
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
{
/*
* This metaslab has just been added so there's no work to do now.
*/
return;
}
/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense we need to let it through.
*/
if (range_tree_space(alloctree) == 0 &&
range_tree_space(*freetree) == 0 &&
return;
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_tree. No other thread can
* be modifying this txg's alloctree, freetree, freed_tree, or
 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
* space map ASSERTs. We drop it whenever we call into the DMU,
* because the DMU can call down to us (e.g. via zio_free()) at
* any time.
*/
VERIFY3U(new_object, !=, 0);
}
/*
* Note: metaslab_condense() clears the space map's histogram.
* Therefore we must verify and remove this histogram before
* condensing.
*/
} else {
}
/*
 * When the space map is loaded, we have an accurate
* histogram in the range tree. This gives us an opportunity
* to bring the space map's histogram up-to-date so we clear
* it first before updating it.
*/
/*
* Since we've cleared the histogram we need to add back
* any free space that has already been processed, plus
* any deferred space. This allows the on-disk histogram
* to accurately reflect all free space even if some space
* is not yet available for allocation (i.e. deferred).
*/
/*
* Add back any deferred free space that has not been
* added back into the in-core free tree yet. This will
* ensure that we don't end up with a space map histogram
* that is completely empty unless the metaslab is fully
* allocated.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
/*
* Always add the free space from this sync pass to the space
* map histogram. We want to make sure that the on-disk histogram
* accounts for all free space. If the space map is not loaded,
* then we will lose some accuracy but will correct it the next
* time we load the space map.
*/
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freetree and
* freed_tree. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
} else {
}
}
}
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
{
/*
* If this metaslab is just becoming available, initialize its
* alloctrees, freetrees, and defertree and add its capacity to
* the vdev.
*/
for (int t = 0; t < TXG_SIZE; t++) {
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
}
}
}
defer_delta = 0;
if (defer_allowed) {
} else {
}
/*
* If there's a metaslab_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
*/
/*
* Move the frees from the defer_tree back to the free
* range tree (if it's loaded). Swap the freed_tree and the
* defer_tree -- this is safe to do because we've just emptied out
* the defer_tree.
*/
if (defer_allowed) {
} else {
}
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
* are back in circulation.
*/
}
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
*/
/*
* If the metaslab is loaded and we've not tried to load or allocate
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
}
if (!metaslab_debug_unload)
}
}
void
{
/*
* Preload the next potential metaslabs
*/
}
static uint64_t
{
return (1ULL << 63);
return (0);
}
/*
* ==========================================================================
* Metaslab allocation tracing facility
* ==========================================================================
*/
void
{
"metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
if (metaslab_trace_ksp != NULL) {
"metaslab_trace_over_limit", KSTAT_DATA_UINT64);
}
}
void
{
if (metaslab_trace_ksp != NULL) {
}
}
/*
* Add an allocation trace element to the allocation tracing list.
*/
static void
{
if (!metaslab_trace_enabled)
return;
/*
* When the tracing list reaches its maximum we remove
* the second element in the list before adding a new one.
* By removing the second element we preserve the original
* entry as a clue to what allocations steps have already been
* performed.
*/
#ifdef DEBUG
panic("too many entries in allocation list");
#endif
}
mat->mat_weight = 0;
/*
* The list is part of the zio so locking is not required. Only
* a single thread will perform allocations for a given zio.
*/
}
void
{
}
void
{
}
/*
* ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
static void
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
return;
return;
}
void
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
return;
return;
}
void
{
#ifdef ZFS_DEBUG
for (int d = 0; d < ndvas; d++) {
}
#endif
}
static uint64_t
{
if (start != -1ULL) {
/* Track the last successful allocation */
}
/*
* Now that we've attempted the allocation we need to update the
* metaslab's maximum block size since it may have changed.
*/
return (start);
}
static uint64_t
{
int i;
for (i = 0; i < d; i++) {
break;
}
}
for (;;) {
/*
* Find the metaslab with the highest weight that is less
* than what we've already tried. In the common case, this
* means that we will examine each metaslab at most once.
* Note that concurrent callers could reorder metaslabs
* by activation/passivation once we have dropped the mg_lock.
* If a metaslab is activated by another thread, and we fail
* to allocate from the metaslab we have selected, we may
* not try the newly-activated metaslab, and instead activate
* another metaslab. This is not optimal, but generally
* does not cause any problems (a possible exception being
 * if every metaslab is completely full except for the
 * newly-activated metaslab which we fail to examine).
*/
continue;
}
/*
* If the selected metaslab is condensing, skip it.
*/
if (msp->ms_condensing)
continue;
break;
min_distance >> 1);
for (i = 0; i < d; i++) {
break;
}
if (i == d)
break;
}
return (-1ULL);
}
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock. We check the
* active status first to see if we need to reselect
* a new metaslab.
*/
continue;
}
continue;
}
continue;
}
/*
* Now that we have the lock, recheck to see if we should
 * continue to use this metaslab for this allocation. The
 * metaslab is now loaded so metaslab_should_allocate() can
* accurately determine if the allocation attempt should
* proceed.
*/
/* Passivate this metaslab and select a new one. */
goto next;
}
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
* to disk.
*/
if (msp->ms_condensing) {
continue;
}
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
break;
}
next:
/*
* We were unable to allocate from this metaslab so determine
* a new weight for this metaslab. Now that we have loaded
* the metaslab we can provide a better hint to the metaslab
* selector.
*
* For space-based metaslabs, we use the maximum block size.
* This information is only available when the metaslab
* is loaded and is more accurate than the generic free
* space weight that was calculated by metaslab_weight().
* This information allows us to quickly compare the maximum
* available allocation in the metaslab to the allocation
* size being requested.
*
* For segment-based metaslabs, determine the new weight
* based on the highest bucket in the range tree. We
* explicitly use the loaded segment weight (i.e. the range
* tree histogram) since it contains the space that is
* currently available for allocation and is accurate
* even within a sync pass.
*/
} else {
}
/*
* We have just failed an allocation attempt, check
* that metaslab_should_allocate() agrees. Otherwise,
* we may end up in an infinite loop retrying the same
* metaslab.
*/
}
return (offset);
}
static uint64_t
{
min_distance, dva, d);
if (offset == -1ULL) {
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
* the minimum gang block size so it must be out of
* space. We must notify the allocation throttle
* to start skipping allocation attempts to this
* metaslab group until more space becomes available.
* Note: this failure cannot be caused by the
* allocation throttle since the allocation throttle
* is only responsible for skipping devices and
* not failing block allocations.
*/
}
}
mg->mg_allocations++;
return (offset);
}
/*
* If we have to write a ditto block (i.e. more than one DVA for a given BP)
* on the same vdev as an existing DVA of this BP, then try to allocate it
* at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
* existing DVAs.
*/
/*
* Allocate a block for the specified i/o.
*/
static int
{
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
}
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try and spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
/*
* It's possible the vdev we're using as the hint no
* longer exists (i.e. removed). Consult the rotor when
* all else fails.
*/
if (flags & METASLAB_HINTBP_AVOID &&
} else {
}
} else if (d != 0) {
} else {
}
/*
* If the hint put us into the wrong metaslab class, or into a
* metaslab group that has been passivated, just follow the rotor.
*/
top:
do {
/*
* Don't allocate from faulted devices.
*/
if (try_hard) {
} else {
}
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging then don't allow
* this metaslab group to skip allocations since that would
* inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
psize);
}
if (!allocatable) {
goto next;
}
/*
* Avoid writing single-copy data to a failing,
* non-redundant vdev, unless we've already tried all
* other vdevs.
*/
goto next;
}
/*
* If we don't need to try hard, then require that the
* block be 1/8th of the device away from any other DVAs
* in this BP. If we are trying hard, allow any offset
* to be used (distance=0).
*/
if (!try_hard) {
distance = 0;
}
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
/*
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
} else if (!metaslab_bias_enabled) {
}
mc->mc_aliquot = 0;
}
return (0);
}
next:
mc->mc_aliquot = 0;
/*
* If we haven't tried hard, do so now.
*/
if (!try_hard) {
goto top;
}
}
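/*
 * Editor-added illustrative sketch (not part of the original source): the
 * rotor bias computed in the comment above (e.g. pool 20% full, vdev 80%
 * full, 512K aliquot: (20 - 80) * 512K / 100 = -307K). The parameter
 * names are assumed stand-ins for the vdev and class statistics.
 */
static int64_t
example_rotor_bias(uint64_t vdev_alloc, uint64_t vdev_space,
    uint64_t class_alloc, uint64_t class_space, uint64_t mg_aliquot)
{
	/* Percent used on this vdev and in the pool as a whole. */
	int64_t vu = (int64_t)((vdev_alloc * 100) / (vdev_space + 1));
	int64_t cu = (int64_t)((class_alloc * 100) / (class_space + 1));

	/* Negative bias shrinks this device's share for this pass. */
	return (((cu - vu) * (int64_t)mg_aliquot) / 100);
}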
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
*/
static void
{
return;
ASSERT(0);
return;
}
if (DVA_GET_GANG(dva))
if (now) {
} else {
}
}
/*
* Intent log support: upon opening the pool after a crash, notify the SPA
* of blocks that the intent log has allocated for immediate write, but
* which are still considered free by the SPA because the last transaction
* group didn't commit yet.
*/
static int
{
int error = 0;
if (DVA_GET_GANG(dva))
return (error);
}
}
return (0);
}
/*
* Reserve some allocation slots. The reservation system must be called
* before we call into the allocator. If there aren't any available slots
* then the I/O will be throttled until an I/O completes and its slots are
* freed up. The function returns true if it was successful in placing
* the reservation.
*/
int flags)
{
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
}
}
return (slot_reserved);
}
void
{
for (int d = 0; d < slots; d++) {
}
}
int
{
int error = 0;
}
for (int d = 0; d < ndvas; d++) {
if (error != 0) {
for (d--; d >= 0; d--) {
}
return (error);
} else {
/*
* Update the metaslab group's queue depth
* based on the newly allocated dva.
*/
}
}
return (0);
}
void
{
for (int d = 0; d < ndvas; d++)
}
int
{
int error = 0;
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
return (error);
}
for (int d = 0; d < ndvas; d++)
break;
return (error);
}
void
{
if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
return;
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
for (int j = 0; j < TXG_SIZE; j++)
for (int j = 0; j < TXG_DEFER_SIZE; j++)
}
}