/* zil.c revision 104e2ed78d9ef0a0f89f320108b8ca29ca3850d5 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/resource.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
* that change the file system in memory with enough information
* to be able to replay them. These are stored in memory until
* either the DMU transaction group (txg) commits them to the stable pool
* and they can be discarded, or they are flushed to the stable log
* (also in the pool) due to an fsync, O_DSYNC or other synchronous
* requirement. In the event of a panic or power failure, those log
* records (transactions) are replayed.
*
* There is one ZIL per file system. Its on-disk (pool) format consists
* of 3 parts:
*
* - ZIL header
* - ZIL blocks
* - ZIL records
*
* A log record holds a system call transaction. Log blocks can
* hold many log records and the blocks are chained together.
* Each ZIL block contains a block pointer (blkptr_t) to the next
* ZIL block in the chain. The ZIL header points to the first
* block in the chain. Note that there is no fixed place in the pool
* to hold blocks. They are dynamically allocated and freed as
* needed from the blocks available. Figure X shows the ZIL structure:
*/
/*
* These global ZIL switches affect all pools
*/
int zil_disable = 0; /* disable intent logging */
int zil_always = 0; /* make every transaction synchronous */
int zil_purge = 0; /* at pool open, just throw everything away */
int zil_noflush = 0; /* don't flush write cache buffers on disks */
static kmem_cache_t *zil_lwb_cache;
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
static void
{
}
static void
{
avl_destroy(t);
}
static int
{
return (EEXIST);
return (0);
}
/*
* Read a log block, make sure it's valid, and byteswap it if necessary.
*/
static int
{
int error;
if (error) {
return (error);
}
if (BP_SHOULD_BYTESWAP(bp))
/*
* Sequence numbers should be... sequential. The checksum verifier for
* the next block should be: <logid[0], logid[1], objset id, seq + 1>.
*/
return (ESTALE);
}
return (ENOENT);
}
return (EOVERFLOW);
}
return (0);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
*/
void
{
if (BP_IS_HOLE(&blk))
return;
/*
* Starting at the block pointed to by zh_log we read the log chain.
* For each block in the chain we strongly check that block to
* ensure its validity. We stop when an invalid block is found.
* For each block pointer in the chain we call parse_blk_func().
* For each record in each valid block we call parse_lr_func().
*/
for (;;) {
if (parse_blk_func != NULL)
if (error)
break;
if (parse_lr_func == NULL)
continue;
}
}
}
/* ARGSUSED */
static void
{
int err;
/*
* Claim log block if not already committed and not already claimed.
*/
}
}
static void
{
}
}
/* ARGSUSED */
static void
{
}
static void
{
/*
* If we previously claimed it, we need to free it.
*/
}
}
}
/*
* Create an on-disk intent log.
*/
static void
{
int error;
int no_blk;
/*
* Initialize the log header block.
*/
/*
* If we don't have a log block already then
* allocate the first log block and assign its checksum verifier.
*/
if (no_blk) {
} else {
error = 0;
}
if (error == 0) {
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
}
if (no_blk)
}
/*
* In one tx, free all log blocks and clear the log header.
*/
void
{
return;
}
/*
* zil_sync clears the zil header as soon as the zl_destroy_txg commits
*/
}
void
{
int error;
if (error) {
return;
}
/*
* Claim all log blocks if we haven't already done so.
*/
}
}
void
{
zil_vdev_t *zv;
if (zil_noflush)
return;
}
void
{
if (zil_noflush)
return;
/*
* remove all chained entries <= seq with same vdev
*/
}
}
/* flush the write cache for this vdev */
}
/*
* Wait for all the flushes to complete. Not all devices actually
* support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
*/
}
}
/*
* Function called when a log block write completes
*/
static void
{
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
* which we allocated the next block sync.
*/
return;
}
/* There's an unwritten buffer in the chain before this one */
return;
}
/*
* We must also follow up the chain for already written buffers
* to see if we can set zl_ss_seq even higher.
*/
break;
/* lwb_seq will be zero if we've written an empty buffer */
}
}
}
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
{
int error;
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
* Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
*/
/*
* Pick a ZIL blocksize. We request a size that is the
* maximum of the previous used size, the current used size and
* the amount waiting in the queue.
*/
if (zil_blksz > ZIL_MAX_BLKSZ)
if (error) {
/*
* Reinitialize the lwb.
* By returning NULL the caller will call tx_wait_synced()
*/
return (NULL);
}
/*
* Allocate a new log write buffer (lwb).
*/
/*
* Put new lwb at the end of the log chain,
* and record the vdev for later flushing
*/
/*
* write the old log block
*/
return (nlwb);
}
static lwb_t *
{
lr_write_t *lr;
char *dbuf;
int error;
return (NULL);
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
sizeof (uint64_t));
/* on memory shortage use dmu_sync */
dlen = 0;
}
} else {
}
if (error) {
if (dlen)
txg);
return (lwb);
}
return (lwb);
}
}
}
/*
* If this record won't fit in the current log block, start a new one.
*/
if (dlen)
return (NULL);
}
if (dlen)
return (lwb);
}
}
if (dlen) {
}
return (lwb);
}
itx_t *
{
return (itx);
}
{
return (seq);
}
/*
* Free up all in-memory intent log transactions that have now been synced.
*/
static void
{
}
}
}
void
{
/*
* Check for any log blocks that can be freed.
* Log blocks are only freed when the log block allocation and
* log records contained within are both known to be committed.
*/
}
/*
* Push zfs transactions to stable storage up to the supplied sequence number.
*/
void
{
return;
for (;;) {
return;
}
break;
}
max_seq = 0;
if (zilog->zl_suspend) {
} else {
}
}
/*
* Loop through in-memory log transactions filling log blocks,
* until we reach the given sequence number and there's no more
* room in the write buffer.
*/
for (;;) {
break;
ZIL_BLK_DATA_SZ(lwb))))
break;
else
}
/* write the last block out */
zilog->zl_cur_used = 0;
}
/*
* Wait if necessary for our seq to be committed.
*/
if (lwb) {
}
zilog->zl_log_error = 0;
}
/* wake up others waiting to start a write */
}
/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
{
zilog->zl_destroy_txg = 0;
}
for (;;) {
return;
}
break;
}
}
void
zil_init(void)
{
}
void
zil_fini(void)
{
}
zilog_t *
{
return (zilog);
}
void
{
zil_vdev_t *zv;
}
}
}
/*
* return true if the initial log block is not valid
*/
static int
{
char *lrbuf;
int error;
if (BP_IS_HOLE(&blk))
return (1);
return (error ? 1 : 0);
}
/*
* Open an intent log.
*/
zilog_t *
{
return (zilog);
}
/*
* Close an intent log.
*/
void
{
if (!zil_is_committed(zilog))
}
/*
* Suspend an intent log. While in suspended mode, we still honor
* synchronous semantics, but we rely on txg_wait_synced() to do it.
* We suspend the log briefly when taking a snapshot so that the snapshot
* contains all the data it's supposed to, and has an empty intent log.
*/
int
{
return (EBUSY);
}
zilog->zl_suspend++;
/*
* Wait for the buffer if it's in the process of
* being written.
*/
continue;
}
}
}
return (0);
}
void
{
zilog->zl_suspend--;
}
typedef struct zil_replay_arg {
void *zr_arg;
void (*zr_rm_sync)(void *arg);
char *zr_lrbuf;
static void
{
if (zilog->zl_stop_replay)
return;
return;
return;
/*
* Make a copy of the data so we can revise and extend it.
*/
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
* However, the log is a mix of different data types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
/*
* If this is a TX_WRITE with a blkptr, suck in the data.
*/
} else {
/*
* A subsequent write may have overwritten this block,
* in which case wbp may have been freed and
* reallocated, and our read of wbp may fail with a
* checksum error. We can safely ignore this because
* the later write will provide the correct data.
*/
}
}
/*
* We must now do two things atomically: replay this log record,
* and update the log header to reflect the fact that we did so.
* We use the DMU's ability to assign into a specific txg to do this.
*/
if (error) {
break;
}
} else {
/*
* On the first pass, arrange for the replay vector
* to fail its dmu_tx_assign(). That's the only way
* to ensure that those code paths remain well tested.
*/
zr->zr_byteswap);
}
if (error == 0) {
}
break;
if (pass != 1)
replay_txg + 1);
}
if (error) {
"dataset %s, seq 0x%llx, txtype %llu\n",
}
/*
* The DMU's dnode layer doesn't see removes until the txg commits,
* so a subsequent claim can spuriously fail with EEXIST.
* To prevent this, if we might have removed an object,
* wait for the delete thread to delete it, and then
* wait for the transaction group to sync.
*/
}
}
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
*/
void
{
/*
* Initialize the log header but don't free the log block
* which will get reused.
*/
return;
}
/*
* Wait for in-progress removes to sync before starting replay.
*/
zilog->zl_stop_replay = 0;
}
/*
* Report whether all transactions are committed
*/
int
{
return (B_FALSE);
/*
* A log write buffer at the head of the list that is not UNWRITTEN
* means there's a lwb yet to be freed after a txg commit
*/
return (B_FALSE);
return (B_TRUE);
}