/* zil.c revision d48e086f569202b5e178a2f514b8bd9d44c7efe2 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
* that change the file system in memory with enough information
* to be able to replay them. These are stored in memory until
* either the DMU transaction group (txg) commits them to the stable pool
* and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to an fsync, O_DSYNC, or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
*
* There is one ZIL per file system. Its on-disk (pool) format consists
* of 3 parts:
*
* - ZIL header
* - ZIL blocks
* - ZIL records
*
* A log record holds a system call transaction. Log blocks can
* hold many log records and the blocks are chained together.
* Each ZIL block contains a block pointer (blkptr_t) to the next
* ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
*/
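/*
 * For reference, the on-disk anchor of the chain is the zil_header_t.
 * This is a simplified sketch (see sys/zil.h; exact fields and padding
 * vary by pool version):
 *
 *	typedef struct zil_header {
 *		uint64_t zh_claim_txg;   // txg in which log blocks were claimed
 *		uint64_t zh_replay_seq;  // highest replayed record sequence number
 *		blkptr_t zh_log;         // first block in the log chain
 *		uint64_t zh_claim_seq;   // highest claimed sequence number
 *		uint64_t zh_flags;       // header flags (e.g. replay needed)
 *	} zil_header_t;
 */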
/*
* This global ZIL switch affects all pools
*/
int zil_disable = 0; /* disable intent logging */
/*
* Tunable parameter for debugging or performance analysis. Setting
* zfs_nocacheflush will cause corruption on power loss if a volatile
* out-of-order write cache is enabled.
*/
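boolean_t zfs_nocacheflush = B_FALSE;
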
static kmem_cache_t *zil_lwb_cache;
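
static boolean_t zil_empty(zilog_t *zilog);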
static int
zil_dva_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = x1;
	const dva_t *dva2 = x2;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);
	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);
	return (0);
}
static void
zil_dva_tree_init(avl_tree_t *t)
{
	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
	    offsetof(zil_dva_node_t, zn_node));
}
static void
zil_dva_tree_fini(avl_tree_t *t)
{
	zil_dva_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_dva_node_t));
	avl_destroy(t);
}
static int
zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
{
	zil_dva_node_t *zn;
	avl_index_t where;

	if (avl_find(t, dva, &where) != NULL)
		return (EEXIST);
	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);
	return (0);
}
static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
/*
* Read a log block, make sure it's valid, and byteswap it if necessary.
*/
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
{
	blkptr_t blk = *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_WAIT;
	int error;

	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
	zb.zb_object = 0;
	zb.zb_level = -1;
	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];

	*abufpp = NULL;

	/*
	 * We shouldn't be doing any scrubbing while we're doing log
	 * replay, so it's OK not to lock.
	 */
	error = arc_read_nolock(NULL, zilog->zl_spa, &blk, arc_getbuf_func,
	    abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);

	if (error == 0) {
		char *data = (*abufpp)->b_data;
		uint64_t blksz = BP_GET_LSIZE(&blk);
		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential. The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
		    (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))))
			error = ECKSUM;

		if (error) {
			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
			*abufpp = NULL;
		}
	}
	return (error);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
* Return the highest sequence number.
*/
uint64_t
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	uint64_t claim_seq = zh->zh_claim_seq;
	uint64_t seq = 0;
	uint64_t max_seq = 0;
	blkptr_t blk = zh->zh_log;
	arc_buf_t *abuf;
	char *lrbuf, *lrp;
	zil_trailer_t *ztp;
	int reclen, error;

	if (BP_IS_HOLE(&blk))
		return (max_seq);

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity. We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	zil_dva_tree_init(&zilog->zl_dva_tree);
	for (;;) {
		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		if (claim_seq != 0 && seq > claim_seq)
			break;
		max_seq = seq;
		error = zil_read_log_block(zilog, &blk, &abuf);
		if (parse_blk_func != NULL)
			parse_blk_func(zilog, &blk, arg, txg);
		if (error)
			break;
		lrbuf = abuf->b_data;
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;
		if (parse_lr_func == NULL) {
			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			continue;
		}
		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused;
		    lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			parse_lr_func(zilog, lr, arg, txg);
		}
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	zil_dva_tree_fini(&zilog->zl_dva_tree);

	return (max_seq);
}
/* ARGSUSED */
static void
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	spa_t *spa = zilog->zl_spa;
	int err;

	/*
	 * Claim log block if not already committed and not already claimed.
	 */
	if (bp->blk_birth >= first_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
		err = zio_wait(zio_claim(NULL, spa, first_txg, bp,
		    NULL, NULL, ZIO_FLAG_MUSTSUCCEED));
		ASSERT(err == 0);
	}
}
static void
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
	}
}
/* ARGSUSED */
static void
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
}
static void
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0)
		zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
}
/*
* Create an on-disk intent log.
*/
static void
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	blk = zh->zh_log;

	/*
	 * If we don't already have an initial log block or we have one
	 * but it's the wrong endianness then allocate one.
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_blk(zilog->zl_spa, &blk, txg);
			BP_ZERO(&blk);
		}

		/* ... elided: allocate a new initial log block of
		 * ZIL_MIN_BLKSZ, setting error to the result ... */
		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0) {
		/* ... elided: allocate the lwb from zil_lwb_cache and
		 * insert it at the head of zl_lwb_list ... */
	}

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}
}
/*
* In one tx, free all log blocks and clear the log header.
* If keep_first is set, then we're replaying a log with no content.
* We want to keep the first block, however, so that the first
* synchronous transaction doesn't require a txg_wait_synced()
* in zil_create(). We don't need to txg_wait_synced() here either
* when keep_first is set, because both zil_create() and zil_destroy()
* will wait for any in-progress destroys to complete.
*/
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(!keep_first);
		/* ... elided: remove each lwb from the list, freeing its
		 * buffer and its log block ... */
	} else {
		/*
		 * Would like to assert zil_empty() but that
		 * would force us to read the log chain which
		 * requires us to do I/O to the log. This is
		 * overkill since we really just want to destroy
		 * the chain anyway.
		 */
		if (!keep_first) {
			(void) zil_parse(zilog, zil_free_log_block,
			    zil_free_log_record, tx, zh->zh_claim_txg);
		}
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}
/*
* return true if the initial log block is not valid
*/
static boolean_t
zil_empty(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	arc_buf_t *abuf = NULL;

	if (BP_IS_HOLE(&zh->zh_log))
		return (B_TRUE);
	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
		return (B_TRUE);
	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	return (B_FALSE);
}
int
zil_claim(char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	/*
	 * Record here whether the zil has any records to replay.
	 * If the header block pointer is null or the block points
	 * to the stubby then we know there are no valid log records.
	 * We use the header to store this state as the zilog gets
	 * freed later in dmu_objset_close().
	 * The flags (and the rest of the header fields) are cleared in
	 * zil_sync() as a result of a zil_destroy(), after replaying the log.
	 *
	 * Note, the intent log can be empty but still need the
	 * stubby to be claimed.
	 */
	if (!zil_empty(zilog)) {
		zh->zh_flags |= ZIL_REPLAY_NEEDED;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number. This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	dmu_objset_close(os);
	return (0);
}
/*
* Check the log by walking the log chain.
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
/* ARGSUSED */
int
zil_check_log_chain(char *osname, void *txarg)
{
	zilog_t *zilog;
	zil_header_t *zh;
	blkptr_t blk;
	arc_buf_t *abuf;
	objset_t *os;
	char *lrbuf;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);
	blk = zh->zh_log;
	if (BP_IS_HOLE(&blk)) {
		dmu_objset_close(os);
		return (0);		/* no chain */
	}

	for (;;) {
		error = zil_read_log_block(zilog, &blk, &abuf);
		if (error)
			break;
		lrbuf = abuf->b_data;
		blk = ((zil_trailer_t *)
		    (lrbuf + BP_GET_LSIZE(&blk)) - 1)->zit_next_blk;
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	dmu_objset_close(os);
	if (error == ECKSUM)
		return (0);		/* normal end of chain */
	return (error);
}
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);
	return (0);
}
void
zil_add_block(zilog_t *zilog, blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}
void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete. Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);
	spa_config_exit(spa, SCL_STATE, FTAG);
}
/*
* Function called when a log block write completes
*/
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	if (zio->io_error)
		zilog->zl_log_error = B_TRUE;

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync. We still have the
	 * zl_lock to ensure zil_sync doesn't kmem free the lwb.
	 */
	txg_rele_to_sync(&lwb->lwb_txgh);
	mutex_exit(&zilog->zl_lock);
}
/*
* Initialize the io for a log block.
*/
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	if (zilog->zl_root_zio == NULL) {
		/* ... elided: create the root zio for this commit ... */
	}
	if (lwb->lwb_zio == NULL) {
		/* ... elided: create the rewrite zio for this log block,
		 * with zil_lwb_write_done() as its done callback ... */
	}
}
/*
* Use the slog as long as the logbias is 'latency' and the current commit size
* is less than the limit or the total list size is less than 2X the limit.
* Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
*/
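uint64_t zil_slog_limit = 1024 * 1024;	/* assumed default (1MB); tunable */

#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
	(((zilog)->zl_cur_used < zil_slog_limit) || \
	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))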
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp =
	    (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize. We request a size that is the
	 * maximum of the previous used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	/* ... elided: allocate the next log block of size zil_blksz,
	 * preferring a slog device when USE_SLOG(zilog) allows, and
	 * set error to the result ... */
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so, may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * We've just experienced an allocation failure, so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_pad = 0;
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * By returning NULL the caller will call txg_wait_synced().
		 */
		return (NULL);
	}

	ztp->zit_pad = 0;
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * kick off the write for the old log block
	 */
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr;	/* common log record */
	lr_write_t *lr = (lr_write_t *)lrc;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen;

	if (lwb == NULL)
		return (NULL);

	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lr->lr_length, sizeof (uint64_t), uint64_t);
	else
		dlen = 0;

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	/*
	 * Update the lrc_seq, to be the log record sequence number. See zil.h.
	 * Then copy the record to the log buffer.
	 */
	lrc->lrc_seq = ++zilog->zl_lr_seq;	/* we are single threaded */
	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			/* alignment is guaranteed */
			lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
			if (dlen) {
				dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
				lr->lr_common.lrc_reclen += dlen;
			} else {
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lr, dbuf, lwb->lwb_zio);
			if (error == EIO) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
			if (error) {
				return (lwb);
			}
		}
	}

	lwb->lwb_nused += reclen + dlen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);

	return (lwb);
}
itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_sod = lrsize;	/* if write & WR_NEED_COPY will be increased */
	itx->itx_lr.lrc_seq = 0;	/* defensive */

	return (itx);
}
uint64_t
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
	uint64_t seq;

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_itx_list, itx);
	zilog->zl_itx_list_sz += itx->itx_sod;
	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	return (seq);
}
/*
* Free up all in-memory intent log transactions that have now been synced.
*/
static void
zil_itx_clean(zilog_t *zilog)
{
	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
	list_t clean_list;
	itx_t *itx;

	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

	mutex_enter(&zilog->zl_lock);
	/* wait for a log writer to finish walking list */
	while (zilog->zl_writer)
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);

	/*
	 * Move the sync'd log transactions to a separate list so we can call
	 * kmem_free without holding the zl_lock.
	 *
	 * There is no need to set zl_writer as we don't drop zl_lock here
	 */
	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		list_insert_tail(&clean_list, itx);
	}
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);

	/* destroy sync'd log transactions */
	while ((itx = list_head(&clean_list)) != NULL) {
		list_remove(&clean_list, itx);
		kmem_free(itx, offsetof(itx_t, itx_lr) +
		    itx->itx_lr.lrc_reclen);
	}
	list_destroy(&clean_list);
}
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
zil_clean(zilog_t *zilog)
{
	itx_t *itx;

	mutex_enter(&zilog->zl_lock);
	itx = list_head(&zilog->zl_itx_list);
	if ((itx != NULL) &&
	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
		(void) taskq_dispatch(zilog->zl_clean_taskq,
		    (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
	}
	mutex_exit(&zilog->zl_lock);
}
static void
zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	uint64_t txg;
	uint64_t commit_seq = 0;
	itx_t *itx, *itx_next = (itx_t *)-1;
	lwb_t *lwb;
	spa_t *spa;

	zilog->zl_writer = B_TRUE;
	spa = zilog->zl_spa;

	if (zilog->zl_suspend) {
		lwb = NULL;
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/*
			 * Return if there's nothing to flush before we
			 * dirty the fs by calling zil_create()
			 */
			if (list_is_empty(&zilog->zl_itx_list)) {
				zilog->zl_writer = B_FALSE;
				return;
			}
			mutex_exit(&zilog->zl_lock);
			zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
			lwb = list_tail(&zilog->zl_lwb_list);
		}
	}

	/* Loop through in-memory log transactions filling log blocks. */
	for (;;) {
		/*
		 * Find the next itx to push:
		 * Push all transactions related to specified foid and all
		 * other transactions except TX_WRITE, TX_TRUNCATE,
		 * TX_SETATTR and TX_ACL for all other files.
		 */
		if (itx_next != (itx_t *)-1)
			itx = itx_next;
		else
			itx = list_head(&zilog->zl_itx_list);
		for (; itx != NULL;
		    itx = list_next(&zilog->zl_itx_list, itx)) {
			if (foid == 0) /* push all foids? */
				break;
			if (itx->itx_sync) /* push all O_[D]SYNC */
				break;
			switch (itx->itx_lr.lrc_txtype) {
			case TX_SETATTR:
			case TX_WRITE:
			case TX_TRUNCATE:
			case TX_ACL:
				/* lr_foid is same offset for these records */
				if (((lr_write_t *)&itx->itx_lr)->lr_foid
				    != foid) {
					continue; /* skip this record */
				}
			}
			break;
		}
		if (itx == NULL)
			break;

		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
			break;

		/*
		 * Save the next pointer. Even though we soon drop
		 * zl_lock all threads that may change the list
		 * (another writer or zil_itx_clean) can't do so until
		 * they have zl_writer.
		 */
		itx_next = list_next(&zilog->zl_itx_list, itx);
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		mutex_exit(&zilog->zl_lock);
		txg = itx->itx_lr.lrc_txg;

		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		kmem_free(itx, offsetof(itx_t, itx_lr) +
		    itx->itx_lr.lrc_reclen);
		mutex_enter(&zilog->zl_lock);
	}
	/* determine commit sequence number */
	itx = list_head(&zilog->zl_itx_list);
	if (itx)
		commit_seq = itx->itx_lr.lrc_seq;
	else
		commit_seq = zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_prev_used = zilog->zl_cur_used;
	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
	 */
	if (zilog->zl_root_zio) {
		(void) zio_wait(zilog->zl_root_zio);
		zilog->zl_root_zio = NULL;
		zil_flush_vdevs(zilog);
	}

	if (zilog->zl_log_error || lwb == NULL) {
		zilog->zl_log_error = 0;
		txg_wait_synced(zilog->zl_dmu_pool, 0);
	}

	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;
	zilog->zl_commit_seq = commit_seq;
}
/*
* Push zfs transactions to stable storage up to the supplied sequence number.
* If foid is 0 push out all transactions, otherwise push only those
 * for that file or that might have been used to create that file.
*/
void
zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	if (zilog == NULL || seq == 0)
		return;

	mutex_enter(&zilog->zl_lock);

	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */

	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
		if (seq < zilog->zl_commit_seq) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
	}
	zil_commit_writer(zilog, seq, foid);
	/* wake up others waiting on the commit */
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);
}
/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	zil_header_t *zh = zil_header_in_syncing_context(zilog);
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = zilog->zl_spa;
	lwb_t *lwb;

	/*
	 * We don't zero out zl_destroy_txg, so make sure we don't try
	 * to destroy it twice.
	 */
	if (spa_sync_pass(spa) != 1)
		return;

	mutex_enter(&zilog->zl_lock);

	zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];

	if (zilog->zl_destroy_txg == txg) {
		blkptr_t blk = zh->zh_log;

		bzero(zh, sizeof (zil_header_t));
		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));

		if (zilog->zl_keep_first) {
			/*
			 * If this block was part of log chain that couldn't
			 * be claimed because a device was missing during
			 * zil_claim(), but that device later returns,
			 * then this block could erroneously appear valid.
			 * To guard against this, assign a new GUID to the new
			 * log chain so it doesn't matter what blk points to.
			 */
			zil_init_log_chain(zilog, &blk);
			zh->zh_log = blk;
		}
	}

	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		zh->zh_log = lwb->lwb_blk;
		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
			break;
		list_remove(&zilog->zl_lwb_list, lwb);
		zio_free_blk(spa, &lwb->lwb_blk, txg);
		kmem_cache_free(zil_lwb_cache, lwb);

		/*
		 * If we don't have anything left in the lwb list then
		 * we've had an allocation failure and we need to zero
		 * out the zil_header blkptr so that we don't end
		 * up freeing the same block twice.
		 */
		if (list_head(&zilog->zl_lwb_list) == NULL)
			BP_ZERO(&zh->zh_log);
	}
	mutex_exit(&zilog->zl_lock);
}
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
void
zil_set_logbias(zilog_t *zilog, uint64_t logbias)
{
	zilog->zl_logbias = logbias;
}
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
	zilog_t *zilog;

	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);

	zilog->zl_header = zh_phys;
	zilog->zl_os = os;
	zilog->zl_spa = dmu_objset_spa(os);
	zilog->zl_dmu_pool = dmu_objset_pool(os);
	zilog->zl_destroy_txg = TXG_INITIAL - 1;
	/* ... elided: initialize zl_lock, the writer and suspend cvs,
	 * the itx and lwb lists, and the vdev tree/lock ... */

	return (zilog);
}
void
zil_free(zilog_t *zilog)
{
	lwb_t *lwb;

	zilog->zl_stop_sync = 1;

	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		list_remove(&zilog->zl_lwb_list, lwb);
		if (lwb->lwb_buf != NULL)
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	/* ... elided: destroy the lists, locks, and cvs ... */
	kmem_free(zilog, sizeof (zilog_t));
}
/*
* Open an intent log.
*/
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
	zilog_t *zilog = dmu_objset_zil(os);

	zilog->zl_get_data = get_data;
	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
	    2, 2, TASKQ_PREPOPULATE);

	return (zilog);
}
/*
* Close an intent log.
*/
void
zil_close(zilog_t *zilog)
{
	/*
	 * If the log isn't already committed, mark the objset dirty
	 * (so zil_sync() will be called) and wait for that txg to sync.
	 */
	if (!zil_is_committed(zilog)) {
		uint64_t txg;
		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);

		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;
	zil_itx_clean(zilog);
}
/*
* Suspend an intent log. While in suspended mode, we still honor
* synchronous semantics, but we rely on txg_wait_synced() to do it.
* We suspend the log briefly when taking a snapshot so that the snapshot
* contains all the data it's supposed to, and has an empty intent log.
*/
int
zil_suspend(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;

	mutex_enter(&zilog->zl_lock);
	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	if (zilog->zl_suspend++ != 0) {
		/*
		 * Someone else already began a suspend.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending)
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		mutex_exit(&zilog->zl_lock);
		return (0);
	}
	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	zil_commit(zilog, UINT64_MAX, 0);

	/*
	 * Wait for any in-flight log writes to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zil_destroy(zilog, B_FALSE);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	return (0);
}
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}
/*
* Read in the data for the dmu_sync()ed block, and change the log
* record to write this whole block.
*/
void
zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
{
	blkptr_t *wbp = &lr->lr_blkptr;
	char *wbuf = (char *)(lr + 1);	/* data follows the lr_write_t */
	uint64_t blksz;

	if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
		blksz = BP_GET_LSIZE(wbp);
		/*
		 * If the blksz is zero then we must be replaying a log
		 * from a version prior to setting the blksize of null blocks.
		 * So we just zero the actual write size requested.
		 */
		if (blksz == 0) {
			bzero(wbuf, lr->lr_length);
			return;
		}
		bzero(wbuf, blksz);
	} else {
		/*
		 * A subsequent write may have overwritten this block, in which
		 * case wbp may have been freed and reallocated, and our
		 * read of wbp may fail with a checksum error. We can safely
		 * ignore this because the later write will provide the
		 * correct data.
		 */
		blksz = BP_GET_LSIZE(wbp);
		/* ... elided: read the dmu_sync()ed block into wbuf,
		 * ignoring checksum errors ... */
	}
	lr->lr_offset -= lr->lr_offset % blksz;
	lr->lr_length = blksz;
}
typedef struct zil_replay_arg {
	objset_t *zr_os;
	zil_replay_func_t **zr_replay;
	void *zr_arg;
	boolean_t zr_byteswap;
	char *zr_lrbuf;
} zil_replay_arg_t;
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	char *name;
	int pass, error;

	if (!zilog->zl_replay)			/* giving up */
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records. Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t))
		zil_get_replay_data(zilog, (lr_write_t *)zr->zr_lrbuf);

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header sequence number to reflect the fact that
	 * we did so. At the end of each replay function the sequence number
	 * is updated if we are in replay mode.
	 */
	for (pass = 1; pass <= 2; pass++) {
		zilog->zl_replaying_seq = lr->lrc_seq;
		/* Only byteswap (if needed) on the 1st pass. */
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
		    zr->zr_byteswap && pass == 1);

		if (!error)
			return;

		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error we try syncing out
		 * any removes then retry the transaction.
		 */
		if (pass == 1)
			txg_wait_synced(zilog->zl_dmu_pool, 0);
	}

bad:
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(zr->zr_os, name);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
	    (u_longlong_t)lr->lrc_seq, (u_longlong_t)(txtype & ~TX_CI),
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_replay = B_FALSE;	/* terminate replay */
	kmem_free(name, MAXNAMELEN);
}
/* ARGSUSED */
static void
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zilog->zl_replay_blks++;
}
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
*/
void
zil_replay(objset_t *os, void *arg,
    zil_replay_func_t *replay_func[TX_MAX_TYPE])
{
	zilog_t *zilog = dmu_objset_zil(os);
	const zil_header_t *zh = zilog->zl_header;
	zil_replay_arg_t zr;

	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
		zil_destroy(zilog, B_TRUE);
		return;
	}

	zr.zr_os = os;
	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_replay = B_TRUE;
	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
	    zh->zh_claim_txg);
	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);

	zil_destroy(zilog, B_FALSE);
	zilog->zl_replay = B_FALSE;
}
/*
* Report whether all transactions are committed
*/
int
zil_is_committed(zilog_t *zilog)
{
	lwb_t *lwb;
	int ret;

	mutex_enter(&zilog->zl_lock);
	while (zilog->zl_writer)
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);

	/* recent unpushed intent log transactions? */
	if (!list_is_empty(&zilog->zl_itx_list)) {
		ret = B_FALSE;
		goto out;
	}

	/* intent log never used? */
	lwb = list_head(&zilog->zl_lwb_list);
	if (lwb == NULL) {
		ret = B_TRUE;
		goto out;
	}

	/*
	 * more than 1 log buffer means zil_sync() hasn't yet freed
	 * entries after a txg has committed
	 */
	if (list_next(&zilog->zl_lwb_list, lwb) != NULL) {
		ret = B_FALSE;
		goto out;
	}

	ret = B_TRUE;
out:
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);
	return (ret);
}
/* ARGSUSED */
int
zil_vdev_offline(char *osname, void *arg)
{
	objset_t *os;
	zilog_t *zilog;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error)
		return (error);

	zilog = dmu_objset_zil(os);
	if (zil_suspend(zilog) != 0)
		error = EEXIST;
	else
		zil_resume(zilog);
	dmu_objset_close(os);
	return (error);
}