zap.c revision 87e5029a3226958edab1512d6182bc74d8d80c9a
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER START
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The contents of this file are subject to the terms of the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Common Development and Distribution License, Version 1.0 only
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (the "License"). You may not use this file except in compliance
fa9e4066f08beec538e775443c5be79dd423fcabahrens * with the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fa9e4066f08beec538e775443c5be79dd423fcabahrens * See the License for the specific language governing permissions
fa9e4066f08beec538e775443c5be79dd423fcabahrens * and limitations under the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * When distributing Covered Code, include this CDDL HEADER in each
fa9e4066f08beec538e775443c5be79dd423fcabahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If applicable, add the following below this CDDL HEADER, with the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * fields enclosed by brackets "[]" replaced with your own identifying
fa9e4066f08beec538e775443c5be79dd423fcabahrens * information: Portions Copyright [yyyy] [name of copyright owner]
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER END
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Use is subject to license terms.
fa9e4066f08beec538e775443c5be79dd423fcabahrens#pragma ident "%Z%%M% %I% %E% SMI"
fa9e4066f08beec538e775443c5be79dd423fcabahrens * This file contains the top half of the zfs directory structure
fa9e4066f08beec538e775443c5be79dd423fcabahrens * implementation. The bottom half is in zap_leaf.c.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The zdir is an extendable hash data structure. There is a table of
fa9e4066f08beec538e775443c5be79dd423fcabahrens * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
fa9e4066f08beec538e775443c5be79dd423fcabahrens * each a constant size and hold a variable number of directory entries.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The pointer table holds a power of 2 number of pointers.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
fa9e4066f08beec538e775443c5be79dd423fcabahrens * by the pointer at index i in the table holds entries whose hash value
fa9e4066f08beec538e775443c5be79dd423fcabahrens * has a zd_prefix_len-bit prefix of i.
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic void zap_leaf_pageout(dmu_buf_t *db, void *vl);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* it's a ptrtbl block */
fa9e4066f08beec538e775443c5be79dd423fcabahrens mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * explicitly zero it since it might be coming from an
fa9e4066f08beec538e775443c5be79dd423fcabahrens * initialized microzap
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zp->zap_freeblk = 2; /* block 1 will be the first leaf */
fa9e4066f08beec538e775443c5be79dd423fcabahrens zp->zap_leafs[i] = 1; /* block 1 will be the first leaf */
fa9e4066f08beec538e775443c5be79dd423fcabahrens * set up block 1 - the first leaf
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Generic routines for dealing with the pointer & cookie tables.
fa9e4066f08beec538e775443c5be79dd423fcabahrens void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* hepb = half the number of entries in a block */
fa9e4066f08beec538e775443c5be79dd423fcabahrens newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Copy the ptrtbl from the old to new location, leaving the odd
fa9e4066f08beec538e775443c5be79dd423fcabahrens * entries blank as we go.
fa9e4066f08beec538e775443c5be79dd423fcabahrens db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* first half of entries in old[b] go to new[2*b+0] */
fa9e4066f08beec538e775443c5be79dd423fcabahrens db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens transfer_func(db_old->db_data, db_new->db_data, hepb);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* second half of entries in old[b] go to new[2*b+1] */
fa9e4066f08beec538e775443c5be79dd423fcabahrens db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("finished; numblocks now %llu (%lluk entries)\n",
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for growing the ptrtbl.
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (i = 0; i < n; i++) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The ptrtbl can no longer be contained in the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * header block. Give it its own entire block, which
fa9e4066f08beec538e775443c5be79dd423fcabahrens * will quadruple the size of the ptrtbl.
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
fa9e4066f08beec538e775443c5be79dd423fcabahrens * This function doesn't increment zap_num_leafs because it's used to
fa9e4066f08beec538e775443c5be79dd423fcabahrens * allocate a leaf chain, which doesn't count against zap_num_leafs.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The directory must be held exclusively for this tx.
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* hence we already dirtied zap->zap_dbuf */
fa9e4066f08beec538e775443c5be79dd423fcabahrens l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens/* ARGSUSED */
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX there are still holds on this block, so we can't free it? */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* dmu_free_range(zap->zap_objset, zap->zap_object, */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */
fa9e4066f08beec538e775443c5be79dd423fcabahrens mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for obtaining zap_leaf_t's
fa9e4066f08beec538e775443c5be79dd423fcabahrens winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* someone else set it first */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Must lock before dirtying, otherwise l->l_phys could change,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * causing ASSERT below to fail.
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
fa9e4066f08beec538e775443c5be79dd423fcabahrens nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
fa9e4066f08beec538e775443c5be79dd423fcabahrens idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
fa9e4066f08beec538e775443c5be79dd423fcabahrens l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* failed to upgrade */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* it split while our locks were down */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* There's only one pointer to us. Chain on another leaf blk. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* There's more than one pointer to us. Split this leaf. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* set sibling pointers */
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
fa9e4066f08beec538e775443c5be79dd423fcabahrens sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* we want the sibling */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* this leaf will soon make us grow the pointer table */
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_checksize(uint64_t integer_size, uint64_t num_integers)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* Only integer sizes supported by C */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* Make sure we won't overflow */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for manipulating attributes.
fa9e4066f08beec538e775443c5be79dd423fcabahrens uint64_t integer_size, uint64_t num_integers, void *buf)
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_read(&zeh, integer_size, num_integers, buf);
fa9e4066f08beec538e775443c5be79dd423fcabahrens const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(fzap_checksize(integer_size, num_integers) == 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX If this leaf is chained, split it if we can. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = fzap_add_cd(zap, name, integer_size, num_integers,
fa9e4066f08beec538e775443c5be79dd423fcabahrens int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX If this leaf is chained, split it if we can. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_update(&zeh, integer_size, num_integers, val);
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for iterating over the attributes.
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* retrieve the next entry at or after zc_hash/zc_cd */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* if no entry, return ENOENT */
87e5029a3226958edab1512d6182bc74d8d80c9aahrens (ZAP_HASH_IDX(zc->zc_hash, zc->zc_leaf->lh_prefix_len) !=
87e5029a3226958edab1512d6182bc74d8d80c9aahrens zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
fa9e4066f08beec538e775443c5be79dd423fcabahrens uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * NB: if a leaf has more pointers than an entire ptrtbl block
fa9e4066f08beec538e775443c5be79dd423fcabahrens * can hold, then it'll be accounted for more than once, since
fa9e4066f08beec538e775443c5be79dd423fcabahrens * we won't have lastblk.
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (i = 0; i < len; i++) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
fa9e4066f08beec538e775443c5be79dd423fcabahrens zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
fa9e4066f08beec538e775443c5be79dd423fcabahrens zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
fa9e4066f08beec538e775443c5be79dd423fcabahrens zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* the ptrtbl is entirely in the header block. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs,
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT,
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;