zap.c revision 8248818d5849649ef734d62da097e90222a23763
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER START
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The contents of this file are subject to the terms of the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Common Development and Distribution License (the "License").
fa9e4066f08beec538e775443c5be79dd423fcabahrens * You may not use this file except in compliance with the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fa9e4066f08beec538e775443c5be79dd423fcabahrens * See the License for the specific language governing permissions
fa9e4066f08beec538e775443c5be79dd423fcabahrens * and limitations under the License.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * When distributing Covered Code, include this CDDL HEADER in each
fa9e4066f08beec538e775443c5be79dd423fcabahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * If applicable, add the following below this CDDL HEADER, with the
fa9e4066f08beec538e775443c5be79dd423fcabahrens * fields enclosed by brackets "[]" replaced with your own identifying
fa9e4066f08beec538e775443c5be79dd423fcabahrens * information: Portions Copyright [yyyy] [name of copyright owner]
fa9e4066f08beec538e775443c5be79dd423fcabahrens * CDDL HEADER END
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Use is subject to license terms.
fa9e4066f08beec538e775443c5be79dd423fcabahrens#pragma ident "%Z%%M% %I% %E% SMI"
fa9e4066f08beec538e775443c5be79dd423fcabahrens * This file contains the top half of the zfs directory structure
fa9e4066f08beec538e775443c5be79dd423fcabahrens * implementation. The bottom half is in zap_leaf.c.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The zdir is an extendable hash data structure. There is a table of
fa9e4066f08beec538e775443c5be79dd423fcabahrens * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
fa9e4066f08beec538e775443c5be79dd423fcabahrens * each a constant size and hold a variable number of directory entries.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * The pointer table holds a power of 2 number of pointers
fa9e4066f08beec538e775443c5be79dd423fcabahrens * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
fa9e4066f08beec538e775443c5be79dd423fcabahrens * by the pointer at index i in the table holds entries whose hash value
fa9e4066f08beec538e775443c5be79dd423fcabahrens * has a zd_prefix_len-bit prefix of i.
fa9e4066f08beec538e775443c5be79dd423fcabahrensint fzap_default_block_shift = 14; /* 16k blocksize */
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic void zap_leaf_pageout(dmu_buf_t *db, void *vl);
fa9e4066f08beec538e775443c5be79dd423fcabahrensstatic uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* it's a ptrtbl block */
fa9e4066f08beec538e775443c5be79dd423fcabahrens mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
fa9e4066f08beec538e775443c5be79dd423fcabahrens * explicitly zero it since it might be coming from an
fa9e4066f08beec538e775443c5be79dd423fcabahrens * initialized microzap
fa9e4066f08beec538e775443c5be79dd423fcabahrens bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zp->zap_freeblk = 2; /* block 1 will be the first leaf */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* block 1 will be the first leaf */
fa9e4066f08beec538e775443c5be79dd423fcabahrens * set up block 1 - the first leaf
fa9e4066f08beec538e775443c5be79dd423fcabahrens VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (1);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Generic routines for dealing with the pointer & cookie tables.
fa9e4066f08beec538e775443c5be79dd423fcabahrens void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* hepb = half the number of entries in a block */
fa9e4066f08beec538e775443c5be79dd423fcabahrens newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Copy the ptrtbl from the old to new location.
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* first half of entries in old[b] go to new[2*b+0] */
fa9e4066f08beec538e775443c5be79dd423fcabahrens VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens transfer_func(db_old->db_data, db_new->db_data, hepb);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* second half of entries in old[b] go to new[2*b+1] */
fa9e4066f08beec538e775443c5be79dd423fcabahrens VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens (void) dmu_free_range(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("finished; numblocks now %llu (%lluk entries)\n",
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * read the nextblk for the sake of i/o error checking,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * so that zap_table_load() will catch errors for
fa9e4066f08beec538e775443c5be79dd423fcabahrens * zap_table_store.
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for growing the ptrtbl.
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (i = 0; i < n; i++) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* In case things go horribly wrong. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens * We are outgrowing the "embedded" ptrtbl (the one
fa9e4066f08beec538e775443c5be79dd423fcabahrens * stored in the header block). Give it its own entire
fa9e4066f08beec538e775443c5be79dd423fcabahrens * block, which will double the size of the ptrtbl.
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
fa9e4066f08beec538e775443c5be79dd423fcabahrens db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
fa9e4066f08beec538e775443c5be79dd423fcabahrens zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
fa9e4066f08beec538e775443c5be79dd423fcabahrens VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
fa9e4066f08beec538e775443c5be79dd423fcabahrens l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
fa9e4066f08beec538e775443c5be79dd423fcabahrens winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for obtaining zap_leaf_t's
fa9e4066f08beec538e775443c5be79dd423fcabahrens winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* someone else set it first */
fa9e4066f08beec538e775443c5be79dd423fcabahrens * lhr_pad was previously used for the next leaf in the leaf
fa9e4066f08beec538e775443c5be79dd423fcabahrens * chain. There should be no chained leafs (as we have removed
fa9e4066f08beec538e775443c5be79dd423fcabahrens * support for them).
fa9e4066f08beec538e775443c5be79dd423fcabahrens * There should be more hash entries than there can be
fa9e4066f08beec538e775443c5be79dd423fcabahrens * chunks to put in the hash table
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* The chunks should begin at the end of the hash table */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* The chunks should end at the end of the block */
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (l);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Must lock before dirtying, otherwise l->l_phys could change,
fa9e4066f08beec538e775443c5be79dd423fcabahrens * causing ASSERT below to fail.
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
fa9e4066f08beec538e775443c5be79dd423fcabahrens idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
fa9e4066f08beec538e775443c5be79dd423fcabahrens old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* We failed to upgrade, or need to grow the pointer table */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* it split while our locks were down */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
fa9e4066f08beec538e775443c5be79dd423fcabahrens prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
fa9e4066f08beec538e775443c5be79dd423fcabahrens sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* check for i/o errors before doing zap_leaf_split */
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* set sibling pointers */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* we want the sibling */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
fa9e4066f08beec538e775443c5be79dd423fcabahrens int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens * We are in the middle of growing the pointer table, or
fa9e4066f08beec538e775443c5be79dd423fcabahrens * this leaf will soon make us grow it.
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* could have finished growing while our locks were down */
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* Only integer sizes supported by C */
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens * Routines for manipulating attributes.
fa9e4066f08beec538e775443c5be79dd423fcabahrens uint64_t integer_size, uint64_t num_integers, void *buf)
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = fzap_checksize(name, integer_size, num_integers);
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_read(&zeh, integer_size, num_integers, buf);
fa9e4066f08beec538e775443c5be79dd423fcabahrens ASSERT(fzap_checksize(name, integer_size, num_integers) == 0);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens int err = fzap_checksize(name, integer_size, num_integers);
fa9e4066f08beec538e775443c5be79dd423fcabahrens return (fzap_add_cd(zap, name, integer_size, num_integers,
fa9e4066f08beec538e775443c5be79dd423fcabahrens int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = fzap_checksize(name, integer_size, num_integers);
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* XXX If this leaf is chained, split it if we can. */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_update(&zeh, integer_size, num_integers, val);
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
87e5029a3226958edab1512d6182bc74d8d80c9aahrens * Routines for iterating over the attributes.
fa9e4066f08beec538e775443c5be79dd423fcabahrensfzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
fa9e4066f08beec538e775443c5be79dd423fcabahrens /* retrieve the next entry at or after zc_hash/zc_cd */
87e5029a3226958edab1512d6182bc74d8d80c9aahrens /* if no entry, return ENOENT */
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
fa9e4066f08beec538e775443c5be79dd423fcabahrenszap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
fa9e4066f08beec538e775443c5be79dd423fcabahrens * NB: if a leaf has more pointers than an entire ptrtbl block
fa9e4066f08beec538e775443c5be79dd423fcabahrens * can hold, then it'll be accounted for more than once, since
fa9e4066f08beec538e775443c5be79dd423fcabahrens * we won't have lastblk.
fa9e4066f08beec538e775443c5be79dd423fcabahrens for (i = 0; i < len; i++) {
fa9e4066f08beec538e775443c5be79dd423fcabahrens err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
fa9e4066f08beec538e775443c5be79dd423fcabahrens if (err == 0) {
int err;
if (err == 0) {