/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Portions Copyright 2011 Martin Matuska
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>

static void txg_sync_thread(dsl_pool_t *dp);
static void txg_quiesce_thread(dsl_pool_t *dp);

int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

/*
* ZFS Transaction Groups
* ----------------------
*
* ZFS transaction groups are, as the name implies, groups of transactions
* that act on persistent state. ZFS asserts consistency at the granularity of
* these transaction groups. Each successive transaction group (txg) is
 * assigned a consecutive 64-bit identifier. There are three active
* transaction group states: open, quiescing, or syncing. At any given time,
* there may be an active txg associated with each state; each active txg may
* either be processing, or blocked waiting to enter the next state. There may
* be up to three active txgs, and there is always a txg in the open state
* (though it may be blocked waiting to enter the quiescing state). In broad
* strokes, transactions -- operations that change in-memory structures -- are
* accepted into the txg in the open state, and are completed while the txg is
* in the open or quiescing states. The accumulated changes are written to
* disk in the syncing state.
*
* Open
*
* When a new txg becomes active, it first enters the open state. New
* transactions -- updates to in-memory structures -- are assigned to the
* currently open txg. There is always a txg in the open state so that ZFS can
* accept new changes (though the txg may refuse new changes if it has hit
* some limit). ZFS advances the open txg to the next state for a variety of
 * reasons, such as hitting a time or size threshold, or because an
 * administrative action must be completed in the syncing state.
*
* Quiescing
*
* After a txg exits the open state, it enters the quiescing state. The
* quiescing state is intended to provide a buffer between accepting new
* transactions in the open state and writing them out to stable storage in
* the syncing state. While quiescing, transactions can continue their
* operation without delaying either of the other states. Typically, a txg is
* in the quiescing state very briefly since the operations are bounded by
* software latencies rather than, say, slower I/O latencies. After all
* transactions complete, the txg is ready to enter the next state.
*
* Syncing
*
* In the syncing state, the in-memory state built up during the open and (to
* a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
 * example, when we write new blocks, we need to allocate space for them; those
* allocations modify metadata (space maps)... which themselves must be
* written to stable storage. During the sync state, ZFS iterates, writing out
* data until it converges and all in-memory changes have been written out.
* The first such pass is the largest as it encompasses all the modified user
* data (as opposed to filesystem metadata). Subsequent passes typically have
* far less data to write as they consist exclusively of filesystem metadata.
*
* To ensure convergence, after a certain number of passes ZFS begins
* overwriting locations on stable storage that had been allocated earlier in
* the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous writes. For the syncing state to
 * converge, however, it must complete a pass in which no new blocks are
 * allocated, since each allocation requires a modification of persistent
 * metadata. Further, to hasten convergence, after a prescribed number of
 * passes ZFS also defers frees and stops compressing.
*
* In addition to writing out user data, we must also execute synctasks during
 * the syncing context. A synctask is the mechanism by which administrative
 * activities, such as creating and destroying snapshots or datasets, are
 * carried out. Note that when a synctask is initiated it enters the open txg,
* and ZFS then pushes that txg as quickly as possible to completion of the
* syncing state in order to reduce the latency of the administrative
* activity. To complete the syncing state, ZFS writes out a new uberblock,
* the root of the tree of blocks that comprise all state stored on the ZFS
* pool. Finally, if there is a quiesced txg waiting, we signal that it can
* now transition to the syncing state.
*/
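
/*
 * For illustration, a typical consumer pushes a change through this
 * pipeline roughly as follows (a hedged sketch using the public DMU
 * interfaces; os, object, offset and size are hypothetical values):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		...modify in-memory structures...	(open state)
 *		dmu_tx_commit(tx);	(drop the hold; txg may quiesce)
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 *
 * The change becomes durable once the txg it joined completes the syncing
 * state; see txg_wait_synced() below.
 */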
/*
* Prepare the txg subsystem.
*/
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	bzero(tx, sizeof (tx_state_t));
	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
			    NULL);
			list_create(&tx->tx_cpu[c].tc_callbacks[i],
			    sizeof (dmu_tx_callback_t),
			    offsetof(dmu_tx_callback_t, dcb_node));
		}
	}

	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

	tx->tx_open_txg = txg;
}

/*
* Close down the txg subsystem.
*/
void
txg_fini(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	ASSERT0(tx->tx_threads);

	mutex_destroy(&tx->tx_sync_lock);
	cv_destroy(&tx->tx_sync_more_cv);
	cv_destroy(&tx->tx_sync_done_cv);
	cv_destroy(&tx->tx_quiesce_more_cv);
	cv_destroy(&tx->tx_quiesce_done_cv);
	cv_destroy(&tx->tx_exit_cv);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
		mutex_destroy(&tx->tx_cpu[c].tc_lock);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
		}
	}

	if (tx->tx_commit_cb_taskq != NULL)
		taskq_destroy(tx->tx_commit_cb_taskq);

	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
	bzero(tx, sizeof (tx_state_t));
}

/*
* Start syncing transaction groups.
*/
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86. This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 32 << 10, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}

static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}

static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}

static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time)
		(void) cv_timedwait(cv, &tx->tx_sync_lock,
		    ddi_get_lbolt() + time);
	else
		cv_wait(cv, &tx->tx_sync_lock);

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}

/*
* Stop syncing transaction groups.
*/
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);

	/*
	 * Finish off any work in progress.
	 */
	ASSERT3U(tx->tx_threads, ==, 2);

	/*
	 * We need to ensure that we've vacated the deferred space_maps.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
	 */
	mutex_enter(&tx->tx_sync_lock);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}

uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
	uint64_t txg;

	mutex_enter(&tc->tc_open_lock);
	txg = tx->tx_open_txg;

	mutex_enter(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	mutex_exit(&tc->tc_lock);

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}

void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	ASSERT(!MUTEX_HELD(&tc->tc_lock));
	mutex_exit(&tc->tc_open_lock);
}

void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}

void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}

/*
* Blocks until all transactions in the group are committed.
*
* On return, the transaction group has reached a stable state in which it can
* then be passed off to the syncing context.
*/
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int g = txg & TXG_MASK;
	int c;

	/*
	 * Grab all tc_open_locks so nobody else can get into this txg.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;

	/*
	 * Now that we've incremented tx_open_txg, we can let threads
	 * enter the next transaction group.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	/*
	 * Quiesce the transaction group by waiting for everyone to txg_exit().
	 */
	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];

		mutex_enter(&tc->tc_lock);
		while (tc->tc_count[g] != 0)
			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
		mutex_exit(&tc->tc_lock);
	}
}

static void
txg_do_callbacks(list_t *cb_list)
{
	dmu_tx_do_callbacks(cb_list, 0);

	list_destroy(cb_list);

	kmem_free(cb_list, sizeof (list_t));
}

/*
* Dispatch the commit callbacks registered on this txg to worker threads.
*
* If no callbacks are registered for a given TXG, nothing happens.
* This function creates a taskq for the associated pool, if needed.
*/
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	list_t *cb_list;
	int c;

	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		/*
		 * No need to lock tx_cpu_t at this point, since this can
		 * only be called once a txg has been synced.
		 */
		int g = txg & TXG_MASK;

		if (list_is_empty(&tc->tc_callbacks[g]))
			continue;

		if (tx->tx_commit_cb_taskq == NULL) {
			/*
			 * Commit callback taskq hasn't been created yet.
			 */
			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
			    max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
			    TASKQ_PREPOPULATE);
		}

		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(cb_list, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		list_move_tail(cb_list, &tc->tc_callbacks[g]);

		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
		    txg_do_callbacks, cb_list, TQ_SLEEP);
	}
}

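/*
 * For illustration (a sketch; my_cb and my_state are hypothetical): a
 * consumer registers a commit callback on an assigned transaction, and
 * the callback later fires from the taskq above once the txg has synced:
 *
 *	static void
 *	my_cb(void *arg, int error)
 *	{
 *		...arg is my_state; error is 0 on successful commit...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_cb, my_state);
 */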
static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0 &&
		    dp->dp_dirty_total < zfs_dirty_data_sync) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg + 1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us. This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}

static void
txg_quiesce_thread(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state. So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    tx->tx_quiesced_txg != 0))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting,
		    tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n", txg);
		tx->tx_quiesced_txg = txg;
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}

/*
* Delay this thread by delay nanoseconds if we are still in the open
* transaction group and there is already a waiting txg quiescing or quiesced.
* Abort the delay if this txg stalls or enters the quiescing state.
*/
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg - 1 || tx->tx_synced_txg == txg - 1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg - 1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg - 1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	mutex_exit(&tx->tx_sync_lock);
}

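/*
 * For example, the write throttle in dmu_tx.c uses this along these lines
 * (a sketch; delay_ns is computed there from the amount of dirty data):
 *
 *	txg_delay(dp, tx->tx_txg, delay_ns, MSEC2NSEC(10));
 *
 * which stalls the writing thread for up to delay_ns, returning early if
 * the txg in question is no longer blocking the pipeline.
 */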
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%p\n",
		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

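/*
 * Passing txg == 0 waits for the currently open txg (plus the
 * deferred-free window) to sync; this is the common way to make all
 * preceding changes durable:
 *
 *	txg_wait_synced(dp, 0);
 */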
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
* If there isn't a txg syncing or in the pipeline, push another txg through
 * the pipeline by quiescing the open txg.
*/
void
txg_kick(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_syncing_txg == 0 &&
	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
		cv_broadcast(&tx->tx_quiesce_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
}

boolean_t
txg_stalled(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}

boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
	    tx->tx_quiesced_txg != 0);
}

/*
* Per-txg object lists.
*/
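/*
 * For illustration (a sketch; foo_t and foo_node are hypothetical): a
 * client embeds a txg_node_t in its own structure and tells the list
 * where to find it:
 *
 *	typedef struct foo {
 *		...
 *		txg_node_t	foo_node;
 *	} foo_t;
 *
 *	txg_list_create(&tl, offsetof(foo_t, foo_node));
 *	(void) txg_list_add(&tl, foo, txg);
 */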
void
txg_list_create(txg_list_t *tl, size_t offset)
{
	int t;

	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

	tl->tl_offset = offset;

	for (t = 0; t < TXG_SIZE; t++)
		tl->tl_head[t] = NULL;
}

void
txg_list_destroy(txg_list_t *tl)
{
	int t;

	for (t = 0; t < TXG_SIZE; t++)
		ASSERT(txg_list_empty(tl, t));

	mutex_destroy(&tl->tl_lock);
}

boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}

/*
* Returns true if all txg lists are empty.
*
 * Warning: this is inherently racy (an item could be added immediately
 * after this function returns). We don't bother with the lock because
 * it wouldn't change the semantics.
*/
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
for (int i = 0; i < TXG_SIZE; i++) {
if (!txg_list_empty(tl, i)) {
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
* Add an entry to the list (unless it's already on the list).
* Returns B_TRUE if it was actually added.
*/
boolean_t
txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		tn->tn_member[t] = 1;
		tn->tn_next[t] = tl->tl_head[t];
		tl->tl_head[t] = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}

/*
* Add an entry to the end of the list, unless it's already on the list.
* (walks list to find end)
* Returns B_TRUE if it was actually added.
*/
boolean_t
txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		txg_node_t **tp;

		for (tp = &tl->tl_head[t]; *tp != NULL;
		    tp = &(*tp)->tn_next[t])
			continue;

		tn->tn_member[t] = 1;
		tn->tn_next[t] = NULL;
		*tp = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}

/*
* Remove the head of the list and return it.
*/
void *
txg_list_remove(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn;
	void *p = NULL;

	mutex_enter(&tl->tl_lock);
	if ((tn = tl->tl_head[t]) != NULL) {
		p = (char *)tn - tl->tl_offset;
		tl->tl_head[t] = tn->tn_next[t];
		tn->tn_next[t] = NULL;
		tn->tn_member[t] = 0;
	}
	mutex_exit(&tl->tl_lock);

	return (p);
}

/*
* Remove a specific item from the list and return it.
*/
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn, **tp;

	mutex_enter(&tl->tl_lock);

	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
		if ((char *)tn - tl->tl_offset == p) {
			*tp = tn->tn_next[t];
			tn->tn_next[t] = NULL;
			tn->tn_member[t] = 0;
			mutex_exit(&tl->tl_lock);
			return (p);
		}
	}

	mutex_exit(&tl->tl_lock);

	return (NULL);
}

boolean_t
txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	return (tn->tn_member[t] != 0);
}

/*
* Walk a txg list -- only safe if you know it's not changing.
*/
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = tl->tl_head[t];

	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}

void *
txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	tn = tn->tn_next[t];

	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
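
/*
 * For example, the canonical walk over one txg's list (a sketch; obj and
 * tl are hypothetical, and the list must not change during the walk):
 *
 *	for (obj = txg_list_head(&tl, txg); obj != NULL;
 *	    obj = txg_list_next(&tl, obj, txg))
 *		process(obj);
 */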