dbuf.c revision ab69d62f363ee70f4e4cded1092a68cc0c63136f
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * CDDL HEADER START
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * The contents of this file are subject to the terms of the
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * Common Development and Distribution License (the "License").
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * You may not use this file except in compliance with the License.
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
8cf870d281dc8c242f083d14dfef05f24aa5fceeJnRouvignac * See the License for the specific language governing permissions
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * and limitations under the License.
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * When distributing Covered Code, include this CDDL HEADER in each
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
8cf870d281dc8c242f083d14dfef05f24aa5fceeJnRouvignac * If applicable, add the following below this CDDL HEADER, with the
8cf870d281dc8c242f083d14dfef05f24aa5fceeJnRouvignac * fields enclosed by brackets "[]" replaced with your own identifying
8cf870d281dc8c242f083d14dfef05f24aa5fceeJnRouvignac * information: Portions Copyright [yyyy] [name of copyright owner]
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * CDDL HEADER END
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * Use is subject to license terms.
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnoustatic int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnoustatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson * Global data structures and functions for the dbuf cache.
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou/* ARGSUSED */
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac/* ARGSUSED */
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou * dbuf hash table routines
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilsondbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
2b44dc864c39ab42305a1c5973ae8c4097f9fbfcjarnou crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
4c858fe08bd39b55ff1b0690065035e6bf0eb64ematthew_swift crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
4c858fe08bd39b55ff1b0690065035e6bf0eb64ematthew_swift crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
/*
 * Hash the (objset, object, level, blkid) tuple by forwarding to dbuf_hash().
 *
 * The expansion is a plain expression with no trailing semicolon, so the
 * macro can be used anywhere an expression is valid (initializers,
 * conditions, unbraced if/else bodies).  Callers terminate the statement
 * themselves, e.g. "uint64_t hv = DBUF_HASH(os, obj, level, blkid);".
 */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignacdbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
d66a2b9575a8780402aa70f28df005dab1f66659JnRouvignac for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * Insert an entry into the hash table. If there is already an element
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac * equal to elem in the hash table, then the already existing element
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * will be returned and the new element will not be inserted.
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * Otherwise returns NULL.
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson uint64_t hv = DBUF_HASH(os, obj, level, blkid);
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * Remove an entry from the hash table. This operation will
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * fail if there are any existing holds on the db.
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * We mustn't hold db_mtx to maintain lock ordering:
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * DBUF_HASH_MUTEX > db_mtx.
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson if (db->db_level != 0 || db->db_evict_func == NULL)
f83953bfeedd3587565245c82e51a82b34dcee5fJnRouvignac * The hash table is big enough to fill all of physical memory
f83953bfeedd3587565245c82e51a82b34dcee5fJnRouvignac * with an average 4K block size. The table will take up
f83953bfeedd3587565245c82e51a82b34dcee5fJnRouvignac * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson /* XXX - we should really return an error instead of assert */
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac for (i = 0; i < DBUF_MUTEXES; i++)
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac for (i = 0; i < DBUF_MUTEXES; i++)
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
e7c279e6efae9a82d5a1b0e00ebe47818e6e9b39JnRouvignac * Other stuff.
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
85d34568b0085aaf3240eff3c046b9ba83d418beJnRouvignac * We can't assert that db_size matches dn_datablksz because it
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson * can be momentarily different when another thread is doing
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac * dnode_set_blksz().
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson dbuf_dirty_record_t *dr = db->db_data_pending;
d48f9588ba8e48621f65fa1f6ff93b9c8b75da42matthew * It should only be modified in syncing context, so
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac * make sure we only have one copy of the data.
7185b49f58c4cdb16d035ecc45e38ec9b1cd9bd0matthew_swift ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift /* verify db->db_blkptr */
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift /* db is pointed to by the dnode */
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift if (db->db.db_object == DMU_META_DNODE_OBJECT)
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift /* db is pointed to by an indirect block */
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * dnode_grow_indblksz() can make this fail if we don't
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * have the struct_rwlock. XXX indblksz no longer
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * grows. safe to do this now?
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift db->db_state != DB_FILL && !dn->dn_free_txg) {
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * If the blkptr isn't set but they have nonzero data,
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * it had better be dirty, otherwise we'll lose that
1026051c3ab4054efc802d7b072c8333b53e6ec1matthew_swift * data when we evict this buffer.
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilsondbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilsondbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson * All reads are synchronous, so we must have a hold on the dbuf
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson if (db->db_level == 0 && db->db_freed_in_flight) {
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson /* we were freed in flight; disregard any error */
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson } else if (zio == NULL || zio->io_error == 0) {
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignacdbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac /* We need the struct_rwlock to prevent db_blkptr from changing. */
069e5999a07d1eb00e6fa8cd78e8b6f8729791d1JnRouvignac db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com * processes the delete record and clears the bp while we are waiting
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com * for the dn_mtx (resulting in a "no" from block_freed).
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com BP_IS_HOLE(db->db_blkptr)))) {
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dd2893207f24e5cc1406a8ad8f5f5598f3516385neil_a_wilson dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
78179d674ec664a39f4b2d80a86c40df49cf1f5cnicolas.capponi@forgerock.com bzero(db->db.db_data, db->db.db_size);
int err = 0;
int prefetch;
return (EIO);
if (prefetch)
if (prefetch)
if (!havepzio)
if (prefetch)
return (err);
if (birth_txg)
return (FALSE);
return (dr);
sizeof (dbuf_dirty_record_t),
return (dr);
} else if (do_free_accounting) {
if (drop_struct_lock)
if (parent_held)
if (drop_struct_lock)
return (dr);
if (!dbuf_gone)
return (ENOENT);
if (err)
return (err);
if (err) {
return (err);
static dmu_buf_impl_t *
return (db);
int blocksize =
return (odb);
return (db);
if (db)
if (db)
top:
int err;
if (fail_sparse) {
if (err) {
if (parent)
return (err);
return (err);
if (parent) {
goto top;
if (parent)
if (holds == 0) {
return (old_user_ptr);
(void) dsl_dataset_block_kill(
i--, dnp++) {
fill++;
int epbs =