/* metaslab.c, revision a31e67878a1bf006016a43cafa1fdffa37e432e6 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	return (mc);
}
void
{
}
}
void
{
} else {
}
}
void
{
} else {
}
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
static int
{
return (1);
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
return (-1);
return (1);
return (0);
}
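/*
 * Illustrative sketch of the comparator described above; it is an
 * approximation, not the original body. Metaslabs sort by weight, highest
 * first, and ties are broken on the space map's starting offset so the AVL
 * tree never sees two entries that compare equal. The field names
 * (ms_weight, ms_map.sm_start) follow metaslab_impl.h and space_map.h.
 */
static int
metaslab_weight_compare_sketch(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	return (0);
}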
{
return (mg);
}
void
{
}
static void
{
}
static void
{
}
static void
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 510].
*/
}
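/*
 * Sketch (assumed, not the original body) of how a group re-sorts a
 * metaslab whose weight has changed: pull it out of the group's AVL tree,
 * update the weight, and re-insert it so the tree stays ordered by the
 * comparator above. The helper name and the mg_lock/mg_metaslab_tree
 * fields follow metaslab_impl.h; the assertion restates the [1, 510]
 * note in code form.
 */
static void
metaslab_group_resort_sketch(metaslab_group_t *mg, metaslab_t *msp,
    uint64_t weight)
{
	/* Weights are either zero or at least SPA_MINBLOCKSIZE - 1. */
	ASSERT(weight == 0 || weight >= SPA_MINBLOCKSIZE - 1);

	mutex_enter(&mg->mg_lock);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}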
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static void
{
}
static void
{
}
static uint64_t
{
return (offset);
}
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
}
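/*
 * Sketch of the cursor-based first-fit search that the comment above
 * refers to (an approximation, not the original body). Free segments live
 * in the space map's AVL tree (sm_root) as space_seg_t records; the search
 * starts at a caller-supplied cursor, takes the first segment that can
 * hold a naturally aligned request, and advances the cursor past it. On a
 * miss the cursor is reset to zero and the search is retried once; if the
 * cursor was already zero, the whole map has been covered and -1ULL is
 * returned.
 */
static uint64_t
metaslab_ff_alloc_sketch(space_map_t *sm, uint64_t *cursor, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_ff_alloc_sketch(sm, cursor, size));
}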
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
/* ARGSUSED */
static void
{
/* No need to update cursor */
}
static space_map_ops_t metaslab_ff_ops = {
};
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
{
/*
* We create the main space map here, but we don't create the
* allocmaps and freemaps until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd take a
* data fault on any attempt to use this metaslab before it's ready.
*/
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(msp, 0);
if (txg != 0) {
/*
* The vdev is dirty, but the metaslab isn't -- it just needs
* to have metaslab_sync_done() invoked from vdev_sync_done().
* [We could just dirty the metaslab, but that would cause us
* to allocate a space map object for it, which is wasteful
* and would mess up the locality logic in metaslab_weight().]
*/
}
return (msp);
}
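/*
 * Sketch of the two txg cases described above (assumed, not the original
 * body). "vd" is the metaslab's top-level vdev, and
 * metaslab_register_sync() is a hypothetical stand-in for however the real
 * code arranges for vdev_sync_done() to reach metaslab_sync_done() on this
 * metaslab without dirtying it.
 */
static void
metaslab_init_txg_sketch(metaslab_t *msp, vdev_t *vd, uint64_t txg)
{
	/*
	 * Opening an existing pool (txg == 0) or creating a new one
	 * (txg == TXG_INITIAL): all of the metaslab's space is usable
	 * right away, so run the sync_done path immediately.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	/*
	 * Adding space to a live pool: the new space becomes usable only
	 * after this txg syncs, so just queue the metaslab for
	 * vdev_sync_done() rather than dirtying it, which would force a
	 * space map object to be allocated prematurely.
	 */
	if (txg != 0)
		metaslab_register_sync(vd, msp, txg);	/* hypothetical */
}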
void
{
int t;
for (t = 0; t < TXG_SIZE; t++) {
}
}
#define	METASLAB_ACTIVE_MASK	\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
#define METASLAB_SMO_BONUS_MULTIPLIER 2
static uint64_t
{
/*
* The baseline weight is the metaslab's free space.
*/
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
/*
* For locality, assign higher weight to metaslabs we've used before.
*/
if (smo->smo_object != 0)
/*
* If this metaslab is one we're actively using, adjust its weight to
* make it preferable to any inactive metaslab so we'll polish it off.
*/
return (weight);
}
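/*
 * Worked sketch of the weight computation outlined above; it is an
 * approximation, not the original body. "free_space" is the metaslab's
 * free bytes, "ms_index" its index on the vdev (0 = lowest LBA, i.e. the
 * fast outer zone), and "ms_count" the number of metaslabs on the vdev
 * (assumed nonzero); "used_before" says whether the metaslab already has a
 * space map object, and "active_bits" carries any activation flags from
 * the current weight.
 */
static uint64_t
metaslab_weight_sketch(uint64_t free_space, uint64_t ms_index,
    uint64_t ms_count, boolean_t used_before, uint64_t active_bits)
{
	uint64_t weight = free_space;

	/* 2:1 outer-to-inner bandwidth scaling, without floating point. */
	weight = 2 * weight - (ms_index * weight) / ms_count;

	/* Locality: prefer metaslabs we have written to before. */
	if (used_before)
		weight *= METASLAB_SMO_BONUS_MULTIPLIER;

	/* An active metaslab keeps sorting ahead of any inactive one. */
	weight |= (active_bits & METASLAB_ACTIVE_MASK);

	return (weight);
}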
static int
{
if (error) {
return (error);
}
}
return (0);
}
static void
{
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
}
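/*
 * Sketch of the passivation step that the note above is about (assumed,
 * not the original body): clamp the weight to the largest request size we
 * still expect to satisfy and clear the activation bits, so the metaslab
 * drops back into normal weight order. metaslab_group_resort_sketch() is
 * the illustrative helper shown earlier.
 */
static void
metaslab_passivate_sketch(metaslab_t *msp, uint64_t size)
{
	/*
	 * A size below SPA_MINBLOCKSIZE means no further allocations will
	 * come from this metaslab, so it should have no free space left.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);

	metaslab_group_resort_sketch(msp->ms_group, msp,
	    MIN(msp->ms_weight, size) & ~METASLAB_ACTIVE_MASK);
}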
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
{
int t;
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_map. No other thread can
* be modifying this txg's allocmap, freemap, freed_map, or smo.
* Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
* We drop it whenever we call into the DMU, because the DMU
* can call down to us (e.g. via zio_free()) at any time.
*/
if (smo->smo_object == 0) {
}
/*
* The in-core space map representation is twice as compact
* as the on-disk one, so it's time to condense the latter
* by generating a pure allocmap from first principles.
*
* This metaslab is 100% allocated,
* minus the content of the in-core map (sm),
* minus what's been freed this txg (freed_map),
* minus allocations from txgs in the future
* (because they haven't been committed yet).
*/
for (t = 1; t < TXG_CONCURRENT_STATES; t++)
}
}
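/*
 * Sketch of the condensation described above (assumed, not the original
 * body): rebuild this txg's allocmap as "the whole metaslab, minus what
 * the in-core map says is free, minus what was freed this txg, minus
 * allocations from not-yet-committed future txgs", so the on-disk space
 * map can be rewritten compactly. The allocmap/freed_map parameters and
 * the ms_allocmap ring mirror the names used in the comments and the
 * surviving TXG_CONCURRENT_STATES loop.
 */
static void
metaslab_condense_sketch(metaslab_t *msp, space_map_t *allocmap,
    space_map_t *freed_map, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	int t;

	/* Start from "100% allocated" ... */
	space_map_vacate(allocmap, NULL, NULL);
	space_map_add(allocmap, sm->sm_start, sm->sm_size);

	/* ... minus the content of the in-core map ... */
	space_map_walk(sm, space_map_remove, allocmap);

	/* ... minus what has been freed this txg ... */
	space_map_walk(freed_map, space_map_remove, allocmap);

	/* ... minus allocations from future, uncommitted txgs. */
	for (t = 1; t < TXG_CONCURRENT_STATES; t++)
		space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
		    space_map_remove, allocmap);
}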
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
{
int t;
/*
* If this metaslab is just becoming available, initialize its
* allocmaps and freemaps and add its capacity to the vdev.
*/
for (t = 0; t < TXG_SIZE; t++) {
}
}
/*
* If there's a space_map_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
* Then, add everything we freed in this txg to the map.
*/
/*
* If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then
* loaded a moment later, the map wouldn't reflect those allocations.)
*/
int evictable = 1;
for (t = 1; t < TXG_CONCURRENT_STATES; t++)
evictable = 0;
if (evictable)
}
}
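/*
 * Sketch of the eviction test described above (assumed, not the original
 * body): a loaded but inactive space map is unloaded only once no future
 * txg still has pending allocations against it, because unloading earlier
 * and reloading a moment later would lose track of those allocations.
 */
static void
metaslab_evict_sketch(metaslab_t *msp, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	int evictable = 1;
	int t;

	if (!sm->sm_loaded || (msp->ms_weight & METASLAB_ACTIVE_MASK))
		return;

	for (t = 1; t < TXG_CONCURRENT_STATES; t++)
		if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
			evictable = 0;

	if (evictable)
		space_map_unload(sm);
}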
static uint64_t
{
return (1ULL << 63);
return (0);
}
static uint64_t
{
int i;
for (i = 0; i < d; i++)
for (;;) {
return (-1ULL);
}
break;
for (i = 0; i < d; i++)
break;
if (i == d)
break;
}
return (-1ULL);
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
continue;
}
continue;
}
continue;
}
break;
}
return (offset);
}
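/*
 * Sketch of the select-then-recheck pattern described above (assumed, not
 * the original body): the best candidate is chosen from the group's AVL
 * tree under mg_lock, but its weight must be validated again once ms_lock
 * is held, since another thread may have consumed the space in between.
 */
static metaslab_t *
metaslab_group_pick_sketch(metaslab_group_t *mg, uint64_t asize)
{
	metaslab_t *msp;

	for (;;) {
		/* Highest-weight metaslab that can cover the request. */
		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(&mg->mg_metaslab_tree); msp != NULL;
		    msp = AVL_NEXT(&mg->mg_metaslab_tree, msp))
			if (msp->ms_weight >= asize)
				break;
		mutex_exit(&mg->mg_lock);

		if (msp == NULL)
			return (NULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Recheck under ms_lock: if the weight changed while we
		 * were blocked and no longer covers the request, pick again.
		 */
		if (msp->ms_weight < asize) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		return (msp);	/* ms_lock is held on return */
	}
}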
/*
* Allocate a block for the specified i/o.
*/
static int
{
int dshift = 3;
int all_zero;
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_allocated because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try to spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
if (flags & METASLAB_HINTBP_AVOID)
else
} else if (d != 0) {
} else {
}
/*
* If the hint put us into the wrong class, just follow the rotor.
*/
top:
do {
/*
* Don't allocate from faulted devices.
*/
if (!vdev_allocatable(vd))
goto next;
/*
* Avoid writing single-copy data to a failing vdev
*/
d == 0 && dshift == 3) {
goto next;
}
distance = 0;
else
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
if (mc->mc_allocated == 0) {
/*
* Determine percent used in units of 0..1024.
* (This is just to avoid floating point.)
*/
/*
* Bias by at most +/- 25% of the aliquot.
*/
}
mc->mc_allocated = 0;
}
return (0);
}
next:
mc->mc_allocated = 0;
if (!all_zero) {
dshift++;
goto top;
}
return (ENOSPC);
}
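/*
 * Worked sketch of the rotor-bias arithmetic described above; it is an
 * approximation, not the original code. Utilization of the vdev and of
 * the pool is computed in 1/1024ths to avoid floating point, and the
 * difference shifts the group's aliquot by at most +/- 25%, steering
 * writes toward under-used vdevs until usage evens out. The "+ 1" in the
 * denominators guards against division by zero.
 */
static int64_t
metaslab_bias_sketch(uint64_t vdev_alloc, uint64_t vdev_space,
    uint64_t pool_alloc, uint64_t pool_space, uint64_t aliquot)
{
	/* Percent used in units of 0..1024. */
	int64_t vu = (vdev_alloc << 10) / (vdev_space + 1);
	int64_t su = (pool_alloc << 10) / (pool_space + 1);

	/* Bias by at most +/- 25% of the aliquot. */
	return (((su - vu) * (int64_t)aliquot) / (4 << 10));
}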
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
*/
static void
{
return;
ASSERT(0);
return;
}
if (DVA_GET_GANG(dva))
if (now) {
} else {
}
}
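/*
 * Sketch of the two free paths behind the "now" branch above (assumed,
 * not the original body): an immediate free undoes an allocation made in
 * this still-uncommitted txg and returns the space to the in-core map
 * right away, while a normal free is only recorded in this txg's freemap
 * and becomes reusable after the txg has synced.
 */
static void
metaslab_free_sketch(metaslab_t *msp, uint64_t offset, uint64_t size,
    uint64_t txg, boolean_t now)
{
	mutex_enter(&msp->ms_lock);

	if (now) {
		/* Undo this txg's allocation and free the space in core. */
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		/* Defer: usable only after this txg syncs. */
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}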
/*
* Intent log support: upon opening the pool after a crash, notify the SPA
* of blocks that the intent log has allocated for immediate write, but
* which are still considered free by the SPA because the last transaction
* group didn't commit yet.
*/
static int
{
int error;
return (ENXIO);
if (DVA_GET_GANG(dva))
return (error);
}
}
return (0);
}
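/*
 * Sketch of the claim path described above (assumed, not the original
 * body): the DVA's range is removed from the metaslab's in-core free space
 * and re-recorded as allocated in the claiming txg, so that blocks the
 * intent log wrote before the crash cannot be handed out again.
 * metaslab_activate_sketch() is a hypothetical stand-in for whatever loads
 * and activates the metaslab in the real code.
 */
static int
metaslab_claim_sketch(metaslab_t *msp, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	int error;

	mutex_enter(&msp->ms_lock);

	/* Make sure the in-core map is loaded before claiming from it. */
	error = metaslab_activate_sketch(msp);	/* hypothetical */
	if (error) {
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	/* Take the exact range out of the free map ... */
	space_map_claim(&msp->ms_map, offset, size);

	/* ... and account for it as an allocation in this txg. */
	if (txg != 0)
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);

	return (0);
}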
int
{
int error = 0;
return (ENOSPC);
}
for (int d = 0; d < ndvas; d++) {
if (error) {
for (d--; d >= 0; d--) {
}
return (error);
}
}
return (0);
}
void
{
for (int d = 0; d < ndvas; d++)
}
int
{
int error = 0;
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
return (error);
}
for (int d = 0; d < ndvas; d++)
break;
return (error);
}
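/*
 * Sketch of the dry-run pattern described above (assumed, not the original
 * body): when txg != 0, the routine first calls itself with txg == 0,
 * which verifies that every DVA is claimable without recording anything;
 * only then does it make the real, side-effecting pass, which therefore
 * cannot fail partway through. metaslab_claim_dva_sketch() is the
 * illustrative per-DVA helper assumed here.
 */
static int
metaslab_claim_all_sketch(spa_t *spa, const dva_t *dva, int ndvas,
    uint64_t txg)
{
	int error = 0;

	if (txg != 0) {
		/* Dry run: verify every DVA, record nothing. */
		error = metaslab_claim_all_sketch(spa, dva, ndvas, 0);
		if (error != 0)
			return (error);
	}

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_claim_dva_sketch(spa, &dva[d], txg);
		if (error != 0)
			break;
	}

	return (error);
}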