/* zil.c revision 55da60b91d96984f12de050ce428373ea25c7f35 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/zfs_context.h>
#include <sys/resource.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
* that change the file system in memory with enough information
* to be able to replay them. These are stored in memory until
* either the DMU transaction group (txg) commits them to the stable pool
* and they can be discarded, or they are flushed to the stable log
* (also in the pool) due to a fsync, O_DSYNC or other synchronous
* requirement. In the event of a panic or power fail then those log
* records (transactions) are replayed.
*
* There is one ZIL per file system. Its on-disk (pool) format consists
* of 3 parts:
*
* - ZIL header
* - ZIL blocks
* - ZIL records
*
* A log record holds a system call transaction. Log blocks can
* hold many log records and the blocks are chained together.
* Each ZIL block contains a block pointer (blkptr_t) to the next
* ZIL block in the chain. The ZIL header points to the first
* block in the chain. Note there is not a fixed place in the pool
* to hold blocks. They are dynamically allocated and freed as
* needed from the blocks available. Figure X shows the ZIL structure:
*/
/*
 * This global ZIL switch affects all pools: when nonzero, intent log
 * replay is disabled for every dataset on every pool.
 */
int zil_replay_disable = 0; /* disable intent logging replay */
/*
* Tunable parameter for debugging or performance analysis. Setting
* zfs_nocacheflush will cause corruption on power loss if a volatile
* out-of-order write cache is enabled.
*/
/*
 * kmem cache for log write buffers (lwb_t).
 * NOTE(review): in this revision the visible zil_init() body is empty and
 * never calls kmem_cache_create() for this cache -- confirm initialization
 * against the complete source.
 */
static kmem_cache_t *zil_lwb_cache;
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
static void
{
}
static void
{
avl_destroy(t);
}
int
{
return (EEXIST);
return (0);
}
static zil_header_t *
{
}
static void
{
}
/*
* Read a log block and make sure it's valid.
*/
static int
char **end)
{
int error;
if (error == 0) {
/*
* Validate the checksummed log block.
*
* Sequence numbers should be... sequential. The checksum
* verifier for the next block should be bp's checksum plus 1.
*
* Also check the log chain linkage and size used.
*/
} else {
}
} else {
} else {
}
}
}
return (error);
}
/*
* Read a TX_WRITE log data block.
*/
static int
{
int error;
if (BP_IS_HOLE(bp)) {
return (0);
}
if (error == 0) {
}
return (error);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
*/
int
{
uint64_t max_blk_seq = 0;
uint64_t max_lr_seq = 0;
int error = 0;
/*
* Old logs didn't record the maximum zh_claim_lr_seq.
*/
/*
* Starting at the block pointed to by zh_log we read the log chain.
* For each block in the chain we strongly check that block to
* ensure its validity. We stop when an invalid block is found.
* For each block pointer in the chain we call parse_blk_func().
* For each record in each valid block we call parse_lr_func().
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
int reclen;
char *end;
if (blk_seq > claim_blk_seq)
break;
break;
blk_count++;
break;
if (error)
break;
goto done;
goto done;
lr_count++;
}
}
done:
return (error);
}
static int
{
/*
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
return (0);
}
static int
{
int error;
return (0);
/*
* If the block is not readable, don't claim it. This can happen
* in normal operation when a log block is written to disk before
* some of the dmu_sync() blocks it points to. In this case, the
* transaction cannot have been committed to anyone (we would have
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
return (error);
}
/* ARGSUSED */
static int
{
return (0);
}
static int
{
/*
* If we previously claimed it, we need to free it.
*/
return (0);
}
static lwb_t *
{
} else {
}
return (lwb);
}
/*
* Create an on-disk intent log.
*/
static lwb_t *
{
int error = 0;
/*
* Wait for any previous destroy to complete.
*/
/*
* Allocate an initial log block if:
* - there isn't one already
* - the existing block is the wrong endianness
*/
if (!BP_IS_HOLE(&blk)) {
}
if (error == 0)
}
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
if (error == 0)
/*
* If we just allocated the first log block, commit our transaction
* and wait for zil_sync() to stuff the block pointer into zh_log.
* (zh is part of the MOS, so we cannot modify it in open context.)
*/
}
return (lwb);
}
/*
* In one tx, free all log blocks and clear the log header.
* If keep_first is set, then we're replaying a log with no content.
* We want to keep the first block, however, so that the first
* synchronous transaction doesn't require a txg_wait_synced()
* in zil_create(). We don't need to txg_wait_synced() here either
* when keep_first is set, because both zil_create() and zil_destroy()
* will wait for any in-progress destroys to complete.
*/
void
{
/*
* Wait for any previous destroy to complete.
*/
return;
ASSERT(!keep_first);
}
} else if (!keep_first) {
}
}
int
{
int error;
if (error) {
return (0);
}
return (0);
}
/*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
* but we can read the entire log later, we will not try to replay
* or destroy beyond the last block we successfully claimed.
*/
}
return (0);
}
/*
* Check the log by walking the log chain.
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
int
{
int error;
if (error) {
return (0);
}
/*
* Because tx == NULL, zil_claim_log_block() will not actually claim
* any blocks, but just determine whether it is possible to do so.
* In addition to checking the log chain, zil_claim_log_block()
* will invoke zio_claim() with a done func of spa_claim_notify(),
* which will update spa_max_claim_txg. See spa_load() for details.
*/
}
static int
{
return (-1);
return (1);
return (0);
}
void
{
int i;
if (zfs_nocacheflush)
return;
/*
* Even though we're zl_writer, we still need a lock because the
* zl_get_data() callbacks may have dmu_sync() done callbacks
* that will run concurrently.
*/
for (i = 0; i < ndvas; i++) {
}
}
}
void
{
/*
* We don't need zl_vdev_lock here because we're the zl_writer,
* and all zl_get_data() callbacks are done.
*/
if (avl_numnodes(t) == 0)
return;
}
/*
* Wait for all the flushes to complete. Not all devices actually
* support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
*/
}
/*
* Function called when a log block write completes
*/
static void
{
/*
* Ensure the lwb buffer pointer is cleared before releasing
* the txg. If we have had an allocation failure and
* the txg is waiting to sync then we want zil_sync()
* to remove the lwb so that it's not picked up as the next new
* one in zil_commit_writer(). zil_sync() will only remove
* the lwb if lwb_buf is null.
*/
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
* which we allocated the next block sync.
*/
}
/*
* Initialize the io for a log block.
*/
static void
{
}
}
}
/*
 * Define a limited set of intent log block sizes.
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
 *
 * The table MUST be terminated by UINT64_MAX: the bucket-search loop in
 * the write path scans with "zil_blksz > zil_block_buckets[i]" and stops
 * only at the sentinel, then treats UINT64_MAX as "use the maximum size".
 * Without the sentinel that scan runs off the end of the array.
 */
uint64_t zil_block_buckets[] = {
	4096,			/* non TX_WRITE */
	8192+4096,		/* data base */
	32*1024 + 4096,		/* NFS writes */
	UINT64_MAX		/* sentinel: terminates the bucket search */
};
/*
* Use the slog as long as the logbias is 'latency' and the current commit size
* is less than the limit or the total list size is less than 2X the limit.
* Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
*/
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
{
int i, error;
} else {
}
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
* Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
* We dirty the dataset to ensure that zil_sync() will be called
* to clean up in the event of allocation failure or I/O failure.
*/
/*
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on size used in the last block.
* - first find the smallest bucket that will fit the block from a
* limited set of block sizes. This is because it's faster to write
* blocks allocated from the same metaslab as they are adjacent or
* close.
* - next find the maximum from the new suggested size and an array of
* previous sizes. This lessens a picket fence effect of wrongly
* guessing the size if we have a stream of say 2k, 64k, 2k, 64k
* requests.
*
* Note we only write what is used, but we can't just allocate
* the maximum block size because we can exhaust the available
* pool log space.
*/
for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
for (i = 0; i < ZIL_PREV_BLKS; i++)
/* pass the old blkptr in order to spread log blocks across devs */
if (!error) {
/*
* Allocate a new log write buffer (lwb).
*/
/* Record the block for later vdev flushing */
}
/* For Slim ZIL only write what is used. */
} else {
}
/*
* clear unused data for security
*/
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
}
static lwb_t *
{
char *lr_buf;
return (NULL);
/*
* If this record won't fit in the current log block, start a new one.
*/
return (NULL);
return (lwb);
}
}
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
char *dbuf;
int error;
if (dlen) {
} else {
}
return (lwb);
}
if (error) {
return (lwb);
}
}
}
/*
* We're actually making an entry, so update lrc_seq to be the
* log record sequence number. Note that this is generally not
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
return (lwb);
}
itx_t *
{
return (itx);
}
void
{
}
{
return (seq);
}
/*
* Free up all in-memory intent log transactions that have now been synced.
*/
static void
{
/* wait for a log writer to finish walking list */
}
/*
* Move the sync'd log transactions to a separate list so we can call
* kmem_free without holding the zl_lock.
*
* There is no need to set zl_writer as we don't drop zl_lock here
*/
}
/* destroy sync'd log transactions */
}
}
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
{
}
}
static void
{
uint64_t commit_seq = 0;
int error = 0;
if (zilog->zl_suspend) {
} else {
/*
* Return if there's nothing to flush before we
* dirty the fs by calling zil_create()
*/
return;
}
}
}
/* Loop through in-memory log transactions filling log blocks. */
/*
* Save the next pointer. Even though we drop zl_lock below,
* all threads that can remove itx list entries (other writers
* and zil_itx_clean()) can't do so until they have zl_writer.
*/
/*
* Determine whether to push this itx.
* Push all transactions related to specified foid and
* all other transactions except those that can be logged
* out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
* for all other files.
*
* If foid == 0 (meaning "push all foids") or
* itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
*/
continue; /* skip this record */
break;
}
/* determine commit sequence number */
if (itx)
else
/* write the last block out */
zilog->zl_cur_used = 0;
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
if (zilog->zl_root_zio) {
}
/*
* Remember the highest committed log sequence number for ztest.
* We only update this value when all the log writes succeeded,
* because ztest wants to ASSERT that it got the whole log chain.
*/
}
/*
* Push zfs transactions to stable storage up to the supplied sequence number.
* If foid is 0 push out all transactions, otherwise push only those
* for that file or might have been used to create that file.
*/
void
{
return;
return;
}
}
/* wake up others waiting on the commit */
}
/*
* Report whether all transactions are committed.
*/
static boolean_t
{
else
return (committed);
}
/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
{
/*
* We don't zero out zl_destroy_txg, so make sure we don't try
* to destroy it twice.
*/
return;
if (*replayed_seq != 0) {
*replayed_seq = 0;
}
if (zilog->zl_keep_first) {
/*
* If this block was part of log chain that couldn't
* be claimed because a device was missing during
* zil_claim(), but that device later returns,
* then this block could erroneously appear valid.
* To guard against this, assign a new GUID to the new
* log chain so it doesn't matter what blk points to.
*/
}
}
break;
/*
* If we don't have anything left in the lwb list then
* we've had an allocation failure and we need to zero
* out the zil_header blkptr so that we don't end
* up freeing the same block twice.
*/
}
}
/*
 * One-time initialization hook for the ZIL module.
 * NOTE(review): the body is empty in this revision even though
 * zil_lwb_cache is declared above; upstream zil.c creates that kmem
 * cache here -- confirm nothing else depends on it being allocated.
 */
void
zil_init(void)
{
}
/*
 * Module teardown counterpart to zil_init().
 * NOTE(review): empty in this revision; upstream destroys zil_lwb_cache
 * here -- confirm against the complete source.
 */
void
zil_fini(void)
{
}
void
{
}
void
{
}
zilog_t *
{
return (zilog);
}
void
{
}
}
/*
* Open an intent log.
*/
zilog_t *
{
return (zilog);
}
/*
* Close an intent log.
*/
void
{
/*
* If the log isn't already committed, mark the objset dirty
* (so zil_sync() will be called) and wait for that txg to sync.
*/
if (!zil_is_committed(zilog)) {
}
}
/*
* Suspend an intent log. While in suspended mode, we still honor
* synchronous semantics, but we rely on txg_wait_synced() to do it.
* We suspend the log briefly when taking a snapshot so that the snapshot
* contains all the data it's supposed to, and has an empty intent log.
*/
int
{
return (EBUSY);
}
if (zilog->zl_suspend++ != 0) {
/*
* Someone else already began a suspend.
* Just wait for them to finish.
*/
while (zilog->zl_suspending)
return (0);
}
/*
* Wait for any in-flight log writes to complete.
*/
return (0);
}
void
{
zilog->zl_suspend--;
}
typedef struct zil_replay_arg {
void *zr_arg;
char *zr_lr;
static int
{
char name[MAXNAMELEN];
return (error);
}
static int
{
int error = 0;
return (0);
return (0);
/* Strip case-insensitive bit, still present in log record */
/*
* If this record type can be logged out of order, the object
* (lr_foid) may no longer exist. That's legitimate, not an error.
*/
return (0);
}
/*
* Make a copy of the data so we can revise and extend it.
*/
/*
* If this is a TX_WRITE with a blkptr, suck in the data.
*/
if (error)
}
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
* However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
/*
* We must now do two things atomically: replay this log record,
* and update the log header sequence number to reflect the fact that
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/
if (error) {
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
* any removes then retry the transaction. Note that we
* specify B_FALSE for byteswap now, so we don't do it twice.
*/
if (error)
}
return (0);
}
/* ARGSUSED */
static int
{
zilog->zl_replay_blks++;
return (0);
}
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
*/
void
{
return;
}
/*
* Wait for in-progress removes to sync before starting replay.
*/
zh->zh_claim_txg);
}
{
return (B_TRUE);
return (B_TRUE);
}
return (B_FALSE);
}
/* ARGSUSED */
int
{
int error;
if (error)
return (error);
if (zil_suspend(zilog) != 0)
else
return (error);
}