metaslab.c revision ea8dc4b6d2251b437950c0056bc626b311c73c27
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER START
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The contents of this file are subject to the terms of the
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock * Common Development and Distribution License (the "License").
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock * You may not use this file except in compliance with the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fa9e4066f08beec538e775443c5be79dd423fcabahrens * See the License for the specific language governing permissions
fa9e4066f08beec538e775443c5be79dd423fcabahrens * and limitations under the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * When distributing Covered Code, include this CDDL HEADER in each
fa9e4066f08beec538e775443c5be79dd423fcabahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If applicable, add the following below this CDDL HEADER, with the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * fields enclosed by brackets "[]" replaced with your own identifying
fa9e4066f08beec538e775443c5be79dd423fcabahrens * information: Portions Copyright [yyyy] [name of copyright owner]
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER END
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Use is subject to license terms.
fa9e4066f08beec538e775443c5be79dd423fcabahrens#pragma ident "%Z%%M% %I% %E% SMI"
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Metaslab classes
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrens mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Metaslab groups
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (-1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If the weights are identical, use the offset to force uniqueness.
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (-1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
fa9e4066f08beec538e775443c5be79dd423fcabahrens mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
fa9e4066f08beec538e775443c5be79dd423fcabahrens sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Metaslabs
fa9e4066f08beec538e775443c5be79dd423fcabahrens * ==========================================================================
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If we're opening an existing pool (txg == 0) or creating
fa9e4066f08beec538e775443c5be79dd423fcabahrens * a new one (txg == TXG_INITIAL), all space is available now.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If we're adding space to an existing pool, the new space
fa9e4066f08beec538e775443c5be79dd423fcabahrens * does not become available until after this txg has synced.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * We enforce this by assigning an initial weight of 0 to new space.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (Transactional allocations for this txg would actually be OK;
fa9e4066f08beec538e775443c5be79dd423fcabahrens * it's intent log allocations that cause trouble. If we wrote
fa9e4066f08beec538e775443c5be79dd423fcabahrens * a log block in this txg and lost power, the log replay would be
fa9e4066f08beec538e775443c5be79dd423fcabahrens * based on the DVA translations that had been synced in txg - 1.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Those translations would not include this metaslab's vdev.)
fa9e4066f08beec538e775443c5be79dd423fcabahrens metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (txg == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens * We're opening the pool. Make the metaslab's
fa9e4066f08beec538e775443c5be79dd423fcabahrens * free space available immediately.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * We're adding a new metaslab to an already-open pool.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Declare all of the metaslab's space to be free.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Note that older transaction groups cannot allocate
fa9e4066f08beec538e775443c5be79dd423fcabahrens * from this metaslab until its existence is committed,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * because we set ms_last_alloc to the current txg.
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- we'll need a call to picker_init here */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- we'll need a call to picker_fini here */
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Write a metaslab to disk in the context of the specified transaction group.
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);
fa9e4066f08beec538e775443c5be79dd423fcabahrens alloc_delta = allocmap->sm_space - freemap->sm_space;
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Write out the current state of the allocation
fa9e4066f08beec538e775443c5be79dd423fcabahrens * world. The current metaslab is full, minus
fa9e4066f08beec538e775443c5be79dd423fcabahrens * stuff that's been freed this txg (freed_map),
fa9e4066f08beec538e775443c5be79dd423fcabahrens * minus allocations from txgs in the future.
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
ea8dc4b6d2251b437950c0056bc626b311c73c27eschrock VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
fa9e4066f08beec538e775443c5be79dd423fcabahrens (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Called after a transaction group has completely synced to mark
fa9e4066f08beec538e775443c5be79dd423fcabahrens * all of the metaslab's free space as usable.
fa9e4066f08beec538e775443c5be79dd423fcabahrens vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (txg != 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- we'll need a call to picker_fini here */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* If we're empty, don't bother sticking around */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* Add the freed blocks to the available space map */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* Safe to use for allocation now */
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The first-fit block picker. No picker_init or picker_fini,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * this is just an experiment to see how it feels to separate out
fa9e4066f08beec538e775443c5be79dd423fcabahrens * the block selection policy from the map updates.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Note: the 'cursor' argument is a form of PPD.
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* If we couldn't find a block after cursor, search again */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (-1ULL);
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
fa9e4066f08beec538e775443c5be79dd423fcabahrens &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Intent log support: upon opening the pool after a crash, notify the SPA
fa9e4066f08beec538e775443c5be79dd423fcabahrens * of blocks that the intent log has allocated for immediate write, but
fa9e4066f08beec538e775443c5be79dd423fcabahrens * which are still considered free by the SPA because the last transaction
fa9e4066f08beec538e775443c5be79dd423fcabahrens * group didn't commit yet.
fa9e4066f08beec538e775443c5be79dd423fcabahrens if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
fa9e4066f08beec538e775443c5be79dd423fcabahrens msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- we'll need a call to picker_init here */
fa9e4066f08beec538e775443c5be79dd423fcabahrens bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
fa9e4066f08beec538e775443c5be79dd423fcabahrens space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Enforce segregation across transaction groups.
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- We should probably not assume we know what ms_weight means */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- the weight test should be in terms of MINFREE */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (msp->ms_usable_space >= size && msp->ms_weight >= size);
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
fa9e4066f08beec538e775443c5be79dd423fcabahrens while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX -- we'll need a call to picker_init here */
fa9e4066f08beec538e775443c5be79dd423fcabahrens bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
fa9e4066f08beec538e775443c5be79dd423fcabahrens if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Allocate a block for the specified i/o.
fa9e4066f08beec538e775443c5be79dd423fcabahrensmetaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Start at the rotor and loop through all mgs until we find something.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Note that there's no locking on mc_rotor or mc_allocated because
fa9e4066f08beec538e775443c5be79dd423fcabahrens * nothing actually breaks if we miss a few updates -- we just won't
fa9e4066f08beec538e775443c5be79dd423fcabahrens * allocate quite as evenly. It all balances out over time.
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If we've just selected this metaslab group,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * figure out whether the corresponding vdev is
fa9e4066f08beec538e775443c5be79dd423fcabahrens * over- or under-used relative to the pool,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * and set an allocation bias to even it out.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Determine percent used in units of 0..1024.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (This is just to avoid floating point.)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Bias by at most +/- 25% of the aliquot.
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Free the block represented by DVA in the context of the specified
fa9e4066f08beec538e775443c5be79dd423fcabahrens * transaction group.
fa9e4066f08beec538e775443c5be79dd423fcabahrens if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {