/* zil.c, revision a6e57bd4c7a2bf9cc33be939d674d4c7d3e67cce */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
* that change the file system in memory with enough information
* to be able to replay them. These are stored in memory until
* either the DMU transaction group (txg) commits them to the stable pool
* and they can be discarded, or they are flushed to the stable log
* (also in the pool) due to a fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
* records (transactions) are replayed.
*
* There is one ZIL per file system. Its on-disk (pool) format consists
* of 3 parts:
*
* - ZIL header
* - ZIL blocks
* - ZIL records
*
* A log record holds a system call transaction. Log blocks can
* hold many log records and the blocks are chained together.
* Each ZIL block contains a block pointer (blkptr_t) to the next
* ZIL block in the chain. The ZIL header points to the first
* block in the chain. Note there is not a fixed place in the pool
* to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
*/
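/*
 * Illustrative sketch only; none of the model_* names below exist in
 * this file. They are hypothetical stand-ins for the layout described
 * above: a header pointing at the first log block, blocks chained
 * through a "next" pointer, and variable-length records packed into
 * each block, so that recovery is a simple walk from the header.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct model_log_record {
	size_t mlr_reclen;	/* total record length (cf. lrc_reclen) */
	uint64_t mlr_seq;	/* record sequence number (cf. lrc_seq) */
} model_log_record_t;

typedef struct model_log_block {
	struct model_log_block *mlb_next;	/* "blkptr" to next block */
	size_t mlb_used;			/* bytes of records in use */
	char mlb_records[512];			/* packed log records */
} model_log_block_t;

typedef struct model_log_header {
	model_log_block_t *mlh_first;		/* first block in the chain */
} model_log_header_t;

static void
model_walk_chain(model_log_header_t *h, void (*visit)(model_log_block_t *))
{
	model_log_block_t *b;

	for (b = h->mlh_first; b != NULL; b = b->mlb_next)
		visit(b);
}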
/*
* This global ZIL switch affects all pools
*/
int zil_disable = 0; /* disable intent logging */
/*
* Tunable parameter for debugging or performance analysis. Setting
* zfs_nocacheflush will cause corruption on power loss if a volatile
* out-of-order write cache is enabled.
*/
boolean_t zfs_nocacheflush = B_FALSE;

static kmem_cache_t *zil_lwb_cache;
static int
zil_dva_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = x1;
	const dva_t *dva2 = x2;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);
	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);
	return (0);
}
static void
zil_dva_tree_init(avl_tree_t *t)
{
	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
	    offsetof(zil_dva_node_t, zn_node));
}
static void
zil_dva_tree_fini(avl_tree_t *t)
{
	zil_dva_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_dva_node_t));
	avl_destroy(t);
}
static int
zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
{
	zil_dva_node_t *zn;
	avl_index_t where;

	if (avl_find(t, dva, &where) != NULL)
		return (EEXIST);
	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);
	return (0);
}
static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}
static void
{
}
/*
* Read a log block, make sure it's valid, and byteswap it if necessary.
*/
static int
{
int error;
/*
* We shouldn't be doing any scrubbing while we're doing log
 * replay, so it's OK not to lock.
*/
if (error == 0) {
/*
* Validate the checksummed log block.
*
* Sequence numbers should be... sequential. The checksum
* verifier for the next block should be bp's checksum plus 1.
*
* Also check the log chain linkage and size used.
*/
}
if (error) {
}
}
return (error);
}
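/*
 * Illustrative sketch (hypothetical model_* names): the checksum rule
 * described in the comment above. Every log block embeds the checksum
 * it was written with, and the expected checksum of the next block is
 * the current block pointer's checksum with the sequence word
 * incremented. A block whose embedded checksum doesn't match that
 * prediction lies past the end of the log, so the walk stops there
 * rather than treating it as an error.
 */
typedef struct model_cksum {
	uint64_t mc_word[4];	/* mc_word[3] plays the ZIL_ZC_SEQ role */
} model_cksum_t;

static int
model_block_is_next_in_chain(const model_cksum_t *prev,
    const model_cksum_t *found)
{
	model_cksum_t want = *prev;

	want.mc_word[3]++;	/* sequence numbers should be sequential */
	return (want.mc_word[0] == found->mc_word[0] &&
	    want.mc_word[1] == found->mc_word[1] &&
	    want.mc_word[2] == found->mc_word[2] &&
	    want.mc_word[3] == found->mc_word[3]);
}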
/*
* Parse the intent log, and call parse_func for each valid record within.
* Return the highest sequence number.
*/
{
if (BP_IS_HOLE(&blk))
return (max_seq);
/*
* Starting at the block pointed to by zh_log we read the log chain.
* For each block in the chain we strongly check that block to
* ensure its validity. We stop when an invalid block is found.
* For each block pointer in the chain we call parse_blk_func().
* For each record in each valid block we call parse_lr_func().
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
for (;;) {
break;
if (parse_blk_func != NULL)
if (error)
break;
if (parse_lr_func == NULL) {
continue;
}
}
}
return (max_seq);
}
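/*
 * Illustrative sketch, reusing the hypothetical model_* types defined
 * near the top of this file: the shape of the parse loop above. Each
 * block gets the per-block callback, then every record packed inside
 * it gets the per-record callback; a callback failure ends the walk,
 * and the highest record sequence number seen is returned.
 */
static uint64_t
model_parse(model_log_header_t *h,
    int (*blk_func)(model_log_block_t *, void *),
    int (*lr_func)(model_log_record_t *, void *), void *arg)
{
	model_log_block_t *b;
	uint64_t max_seq = 0;

	for (b = h->mlh_first; b != NULL; b = b->mlb_next) {
		size_t off = 0;

		if (blk_func != NULL && blk_func(b, arg) != 0)
			break;		/* invalid block ends the chain */
		if (lr_func == NULL)
			continue;
		while (off < b->mlb_used) {
			model_log_record_t *lr =
			    (model_log_record_t *)(b->mlb_records + off);

			if (lr->mlr_reclen == 0)
				break;	/* corrupt record; stop walking */
			if (lr_func(lr, arg) != 0)
				return (max_seq);
			if (lr->mlr_seq > max_seq)
				max_seq = lr->mlr_seq;
			off += lr->mlr_reclen;
		}
	}
	return (max_seq);
}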
/* ARGSUSED */
static void
{
int err;
/*
* Claim log block if not already committed and not already claimed.
*/
}
}
static void
{
}
}
/* ARGSUSED */
static void
{
}
static void
{
/*
* If we previously claimed it, we need to free it.
*/
}
}
}
/*
* Create an on-disk intent log.
*/
static void
{
int error = 0;
/*
* Wait for any previous destroy to complete.
*/
/*
* If we don't already have an initial log block, allocate one now.
*/
if (BP_IS_HOLE(&blk)) {
if (error == 0)
}
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
if (error == 0) {
}
/*
* If we just allocated the first log block, commit our transaction
 * and wait for zil_sync() to stuff the block pointer into zh_log.
* (zh is part of the MOS, so we cannot modify it in open context.)
*/
}
}
/*
* In one tx, free all log blocks and clear the log header.
* If keep_first is set, then we're replaying a log with no content.
* We want to keep the first block, however, so that the first
* synchronous transaction doesn't require a txg_wait_synced()
* in zil_create(). We don't need to txg_wait_synced() here either
* when keep_first is set, because both zil_create() and zil_destroy()
* will wait for any in-progress destroys to complete.
*/
void
{
/*
* Wait for any previous destroy to complete.
*/
return;
/*
 * It is possible for the ZIL to get the previously mounted zilog
 * structure of the same dataset if it is quickly remounted and the
 * dbuf eviction has not completed. In this case we can see a
 * non-empty lwb list and keep_first will be set. We fix this by
 * clearing keep_first. This will be slower but it's very rare.
*/
ASSERT(!keep_first);
}
} else {
if (!keep_first) {
}
}
}
/*
* zil_rollback_destroy() is only called by the rollback code.
* We already have a syncing tx. Rollback has exclusive access to the
* dataset, so we don't have to worry about concurrent zil access.
* The actual freeing of any log blocks occurs in zil_sync() later in
* this txg syncing phase.
*/
void
{
return;
/*
 * Ensure there's no outstanding ZIL IO. Having no lwbs, or just
 * the unused one allocated in advance, is OK.
*/
}
int
{
int error;
if (error) {
return (0);
}
/*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
* but we can read the entire log later, we will not try to replay
* or destroy beyond the last block we successfully claimed.
*/
}
return (0);
}
/*
* Check the log by walking the log chain.
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
/* ARGSUSED */
int
{
char *lrbuf;
int error;
if (error) {
return (0);
}
if (BP_IS_HOLE(&blk)) {
return (0); /* no chain */
}
for (;;) {
if (error)
break;
}
return (0); /* normal end of chain */
return (error);
}
/*
* Clear a log chain
*/
/* ARGSUSED */
int
{
int error;
if (error) {
return (0);
}
return (0);
}
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);
	return (0);
}
void
{
int i;
if (zfs_nocacheflush)
return;
/*
* Even though we're zl_writer, we still need a lock because the
* zl_get_data() callbacks may have dmu_sync() done callbacks
* that will run concurrently.
*/
for (i = 0; i < ndvas; i++) {
}
}
}
void
{
/*
* We don't need zl_vdev_lock here because we're the zl_writer,
* and all zl_get_data() callbacks are done.
*/
if (avl_numnodes(t) == 0)
return;
}
/*
* Wait for all the flushes to complete. Not all devices actually
* support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
*/
}
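/*
 * Illustrative sketch (hypothetical model_* names): the add/flush
 * pattern implemented by zil_add_block() and zil_flush_vdevs() above.
 * While log blocks are written we remember which devices they landed
 * on, deduplicated because one flush per device is enough; at commit
 * time each remembered device gets a single write-cache flush, and a
 * flush failure is tolerated since some devices don't support it.
 */
#define	MODEL_MAX_VDEVS	16

static uint64_t model_vdevs[MODEL_MAX_VDEVS];
static size_t model_nvdevs;

static void
model_add_vdev(uint64_t vdev)
{
	size_t i;

	for (i = 0; i < model_nvdevs; i++)
		if (model_vdevs[i] == vdev)
			return;		/* already queued for flushing */
	if (model_nvdevs < MODEL_MAX_VDEVS)
		model_vdevs[model_nvdevs++] = vdev;
}

static void
model_flush_vdevs(int (*flush)(uint64_t))
{
	size_t i;

	for (i = 0; i < model_nvdevs; i++)
		(void) flush(model_vdevs[i]);	/* failure here is OK */
	model_nvdevs = 0;
}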
/*
* Function called when a log block write completes
*/
static void
{
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
* which we allocated the next block sync.
*/
}
/*
* Initialize the io for a log block.
*
* Note, we should not initialize the IO until we are about
* to use it, since zio_rewrite() does a spa_config_enter().
*/
static void
{
}
}
}
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
{
int error;
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
* Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
*/
/*
* Pick a ZIL blocksize. We request a size that is the
* maximum of the previous used size, the current used size and
* the amount waiting in the queue.
*/
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;
/* pass the old blkptr in order to spread log blocks across devs */
if (error) {
/*
* We dirty the dataset to ensure that zil_sync() will
* be called to remove this lwb from our zl_lwb_list.
 * Failing to do so may leave an lwb with a NULL lwb_buf
* hanging around on the zl_lwb_list.
*/
/*
 * Since we've just experienced an allocation failure, we
* terminate the current lwb and send it on its way.
*/
/*
 * By returning NULL the caller will call txg_wait_synced().
*/
return (NULL);
}
/*
* Allocate a new log write buffer (lwb).
*/
/*
* Put new lwb at the end of the log chain
*/
/* Record the block for later vdev flushing */
/*
* kick off the write for the old log block
*/
return (nlwb);
}
static lwb_t *
{
return (NULL);
else
dlen = 0;
/*
* If this record won't fit in the current log block, start a new one.
*/
return (NULL);
return (lwb);
}
}
/*
 * Update lrc_seq to be the log record sequence number. See zil.h.
* Then copy the record to the log buffer.
*/
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
char *dbuf;
int error;
/* alignment is guaranteed */
if (dlen) {
} else {
}
if (error) {
return (lwb);
}
}
}
return (lwb);
}
itx_t *
{
return (itx);
}
{
return (seq);
}
/*
* Free up all in-memory intent log transactions that have now been synced.
*/
static void
{
/* wait for a log writer to finish walking the list */
}
/*
* Move the sync'd log transactions to a separate list so we can call
* kmem_free without holding the zl_lock.
*
* There is no need to set zl_writer as we don't drop zl_lock here
*/
}
/* destroy sync'd log transactions */
}
}
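/*
 * Illustrative sketch (POSIX threads and hypothetical model_* names):
 * the locking pattern zil_itx_clean() uses above. Under the lock we
 * only unlink the already-synced entries onto a private local list,
 * assuming the list is kept in txg order; the actual frees happen
 * after the lock is dropped so other threads aren't stalled behind
 * the free work.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct model_itx {
	struct model_itx *mi_next;
	uint64_t mi_synced_txg;
} model_itx_t;

static void
model_itx_clean(model_itx_t **listp, pthread_mutex_t *lock,
    uint64_t last_synced_txg)
{
	model_itx_t *done = NULL, *itx;

	pthread_mutex_lock(lock);
	/* move synced entries to a local list while holding the lock */
	while ((itx = *listp) != NULL &&
	    itx->mi_synced_txg <= last_synced_txg) {
		*listp = itx->mi_next;
		itx->mi_next = done;
		done = itx;
	}
	pthread_mutex_unlock(lock);

	/* destroy the synced entries without holding the lock */
	while ((itx = done) != NULL) {
		done = itx->mi_next;
		free(itx);
	}
}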
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
{
}
}
void
{
uint64_t commit_seq = 0;
if (zilog->zl_suspend) {
} else {
/*
* Return if there's nothing to flush before we
* dirty the fs by calling zil_create()
*/
return;
}
}
}
/* Loop through in-memory log transactions filling log blocks. */
for (;;) {
/*
* Find the next itx to push:
 * Push all transactions related to the specified foid and all
 * other transactions except TX_WRITE, TX_TRUNCATE,
 * TX_SETATTR and TX_ACL for all other files (a standalone
 * sketch of this filter follows this function).
*/
else
if (foid == 0) /* push all foids? */
break;
break;
case TX_SETATTR:
case TX_WRITE:
case TX_TRUNCATE:
case TX_ACL:
/* lr_foid is at the same offset in all these records */
!= foid) {
continue; /* skip this record */
}
}
break;
}
break;
break;
}
/*
* Save the next pointer. Even though we soon drop
 * zl_lock, all threads that may change the list
* (another writer or zil_itx_clean) can't do so until
* they have zl_writer.
*/
}
/* determine commit sequence number */
if (itx)
else
/* write the last block out */
zilog->zl_cur_used = 0;
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
if (zilog->zl_root_zio) {
}
zilog->zl_log_error = 0;
}
}
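/*
 * Illustrative sketch (hypothetical MODEL_TX_* names): the itx filter
 * described in the loop above. When committing on behalf of a single
 * file (foid != 0) we still push every record type whose effects are
 * not tied to one file, and skip only the per-file record types that
 * name some other file.
 */
typedef enum model_txtype {
	MODEL_TX_CREATE, MODEL_TX_WRITE, MODEL_TX_TRUNCATE,
	MODEL_TX_SETATTR, MODEL_TX_ACL
} model_txtype_t;

static int
model_itx_matches(model_txtype_t txtype, uint64_t lr_foid, uint64_t foid)
{
	if (foid == 0)
		return (1);		/* push all foids */
	switch (txtype) {
	case MODEL_TX_WRITE:
	case MODEL_TX_TRUNCATE:
	case MODEL_TX_SETATTR:
	case MODEL_TX_ACL:
		/* per-file record types: push only the given file's */
		return (lr_foid == foid);
	default:
		/* e.g. a create may have been used to make the file */
		return (1);
	}
}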
/*
* Push zfs transactions to stable storage up to the supplied sequence number.
* If foid is 0 push out all transactions, otherwise push only those
 * for that file or those that might have been used to create that file.
*/
void
{
return;
return;
}
}
/* wake up others waiting on the commit */
}
/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
{
if (zilog->zl_keep_first) {
/*
* If this block was part of log chain that couldn't
* be claimed because a device was missing during
* zil_claim(), but that device later returns,
* then this block could erroneously appear valid.
* To guard against this, assign a new GUID to the new
* log chain so it doesn't matter what blk points to.
*/
}
}
for (;;) {
return;
}
break;
/*
* If we don't have anything left in the lwb list then
* we've had an allocation failure and we need to zero
* out the zil_header blkptr so that we don't end
* up freeing the same block twice.
*/
}
}
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
zilog_t *
{
return (zilog);
}
void
{
}
}
/*
* return true if the initial log block is not valid
*/
static boolean_t
{
return (B_TRUE);
return (B_TRUE);
return (B_FALSE);
}
/*
* Open an intent log.
*/
zilog_t *
{
return (zilog);
}
/*
* Close an intent log.
*/
void
{
/*
* If the log isn't already committed, mark the objset dirty
* (so zil_sync() will be called) and wait for that txg to sync.
*/
if (!zil_is_committed(zilog)) {
}
}
/*
* Suspend an intent log. While in suspended mode, we still honor
* synchronous semantics, but we rely on txg_wait_synced() to do it.
* We suspend the log briefly when taking a snapshot so that the snapshot
* contains all the data it's supposed to, and has an empty intent log.
*/
int
{
return (EBUSY);
}
if (zilog->zl_suspend++ != 0) {
/*
* Someone else already began a suspend.
* Just wait for them to finish.
*/
while (zilog->zl_suspending)
return (0);
}
/*
* Wait for any in-flight log writes to complete.
*/
return (0);
}
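/*
 * Illustrative sketch (POSIX threads, hypothetical model_* names): the
 * suspend handshake above. The first suspender does the real work
 * with mz_suspending set; later callers just bump the count and wait
 * on the condition variable until the first one finishes.
 */
typedef struct model_zil {
	pthread_mutex_t mz_lock;
	pthread_cond_t mz_cv;
	int mz_suspend;		/* nested suspend count */
	int mz_suspending;	/* first suspender still working */
} model_zil_t;

static void
model_suspend(model_zil_t *z)
{
	pthread_mutex_lock(&z->mz_lock);
	if (z->mz_suspend++ != 0) {
		/* someone else began a suspend; wait for them */
		while (z->mz_suspending)
			pthread_cond_wait(&z->mz_cv, &z->mz_lock);
		pthread_mutex_unlock(&z->mz_lock);
		return;
	}
	z->mz_suspending = 1;
	pthread_mutex_unlock(&z->mz_lock);

	/* ... the first suspender quiesces and commits the log here ... */

	pthread_mutex_lock(&z->mz_lock);
	z->mz_suspending = 0;
	pthread_cond_broadcast(&z->mz_cv);
	pthread_mutex_unlock(&z->mz_lock);
}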
void
{
zilog->zl_suspend--;
}
typedef struct zil_replay_arg {
	zil_replay_cleaner_t *zr_replay_cleaner;
	void *zr_arg;
	boolean_t zr_byteswap;
	char *zr_lrbuf;
} zil_replay_arg_t;
static void
{
char *name;
if (zilog->zl_stop_replay)
return;
return;
return;
/* Strip case-insensitive bit, still present in log record */
/*
* Make a copy of the data so we can revise and extend it.
*/
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
* However, the log is a mix of different data types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
/*
* If this is a TX_WRITE with a blkptr, suck in the data.
*/
} else {
/*
* A subsequent write may have overwritten this block,
 * in which case wbp may have been freed and
* reallocated, and our read of wbp may fail with a
* checksum error. We can safely ignore this because
* the later write will provide the correct data.
*/
}
}
/*
* We must now do two things atomically: replay this log record,
* and update the log header to reflect the fact that we did so.
* We use the DMU's ability to assign into a specific txg to do this.
*/
if (error) {
break;
}
} else {
/*
* On the first pass, arrange for the replay vector
* to fail its dmu_tx_assign(). That's the only way
* to ensure that those code paths remain well tested.
*
* Only byteswap (if needed) on the 1st pass.
*/
}
if (error == 0) {
}
if (!error)
return;
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error other than ERESTART
 * we try syncing out any removes and then retry the
 * transaction.
*/
if (zr->zr_replay_cleaner)
continue; /* retry */
}
break;
if (pass != 1)
replay_txg + 1);
}
"dataset %s, seq 0x%llx, txtype %llu %s\n",
}
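/*
 * Illustrative sketch (hypothetical callbacks): the retry policy
 * described in the comment above. A replay attempt that fails because
 * a pending remove hasn't been synced yet is retried after forcing the
 * removes out; a persistent failure is returned to the caller.
 */
static int
model_replay_record(int (*replay)(void *), void (*sync_removes)(void),
    void *record, int max_passes)
{
	int pass, error = 0;

	for (pass = 1; pass <= max_passes; pass++) {
		error = replay(record);
		if (error == 0)
			return (0);
		/* the remove may not be visible yet; sync and retry */
		sync_removes();
	}
	return (error);
}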
/* ARGSUSED */
static void
{
zilog->zl_replay_blks++;
}
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
*/
void
{
return;
}
/*
* Wait for in-progress removes to sync before starting replay.
*/
zilog->zl_stop_replay = 0;
zh->zh_claim_txg);
}
/*
* Report whether all transactions are committed
*/
int
{
int ret;
/* recent unpushed intent log transactions? */
goto out;
}
/* intent log never used? */
goto out;
}
/*
* more than 1 log buffer means zil_sync() hasn't yet freed
* entries after a txg has committed
*/
goto out;
}
out:
return (ret);
}