lufsboot.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/machparam.h>
#include <sys/stat.h>
#include <sys/bootdebug.h>
#include <sys/salib.h>
#include <sys/saio.h>
#include <sys/filep.h>
/*
* Big theory statement on how ufsboot makes use of the log
* in case the filesystem wasn't shut down cleanly.
*
* The structure of the ufs on-disk log looks like this:
*
* +-----------------+
* | SUPERBLOCK |
* | ... |
* | fs_logbno +--> +-----------------------+
* | ... | | EXTENT BLOCK |
* +-----------------+ | ... |
* | nextents |
* +----------------------+ extents[0].pbno |
* | | { extents[1].pbno } +------------+
* | | ... +--> ... |
* | +-----------------------+ |
* v |
* +-----------------------------+ \ |
* | ON-DISK LOG HEADER | | |
* | ... | | |
* | od_head_lof +--+ | |
* | ... | | | |
* +-----------------------------+ <|---|- od_bol_lof |
* | sector (may contain deltas) | | | (logical offset) |
* | +-------------------------+ | | |
* | | trailer (some ident#) | | > extents[0].nbno |
* +---+-------------------------+ | | blocks ("sectors") |
* . . | | |
* . . | | |
* +-----------------------------+<-+ | |
* | delta1 delta2 delta3 | | |
* | d +-------------------------+ | |
* | e | ident#: od_head_ident | | |
* +---+-------------------------+ / |
* |
* +-----------------------------+ <---------------------------+
* | lta4 delta5 delta6 de |
* | l +-------------------------+
* | t | ident#: od_head_ident+1 |
* +---+-------------------------+
* . .
* +-----------------------------+
* | sector (may contain deltas) |
* | +------------------+
* | | trailer (ident#) |
* +----------+------------------+ <-- od_eol_lof (logical offset)
*
* The ufs on-disk log has the following properties:
*
* 1. The log is made up from at least one extent. "fs_logbno" in
* the superblock points to where this is found.
* 2. Extents describe the logical layout.
* - Logical offset 0 is the on-disk log header. It's also
* at the beginning of the first physical block.
* - If there's more than one extent, the equation holds:
* extent[i+1].lbno == extent[i].lbno + extent[i].nbno
* i.e. logical offsets form a contiguous sequence. Yet on disk,
* two logically-adjacent offsets may be located in two
* physically disjoint extents, so logical offsets need to be
* translated into physical disk block addresses for access.
* - Various fields in the on-disk log header structure refer
* to such logical log offsets.
* 3. The actual logical logspace begins after the log header, at
* the logical offset indicated by "od_bol_lof". Every 512 Bytes
* (a "sector" in terms of ufs logging) is a sector trailer which
* contains a sequence number, the sector ident.
* 4. Deltas are packed tight in the remaining space, i.e. a delta
* may be part of more than one sector. Reads from the logspace
* must be split at sector boundaries, since the trailer is never
* part of a delta. Delta sizes vary.
* 5. The field "od_head_lof" points to the start of the dirty part
* of the log, i.e. to the first delta header. Likewise, "od_head_ident"
* is the sequence number where the valid part of the log starts; if
* the sector pointed to by "od_head_lof" has a sector ident different
* from "od_head_ident", the log is empty.
* 6. The valid part of the log extends for as many sectors as their ident
* numbers form a contiguous sequence. When reaching the logical end of
* the log, "od_eol_lof", logical offsets wrap around to "od_bol_lof",
* i.e. the log forms a circular buffer.
*
* For the strategy how to handle accessing the log, item 4. is the
* most important one - its consequence is that the log can only be
* read in one direction - forward, starting at the head.
*
* The task of identifying whether a given metadata block is
* actually in the log therefore requires reading the entire
* log. Doing so is memory-efficient but kills speed if re-done
* at every metadata read (64MB log size vs. 512 byte metadata
* block size: 128 times as much I/O, possibly only to find out
* that this block was not in the log ...).
*
* First thought to speed this up is to let ufsboot roll the log.
* But this is not possible because:
* - ufsboot currently does not implement any write functionality,
* the boot-time ufs implementation is read-only.
* - firmware write interfaces may or may not be available, in any
* case, they're rarely used and untested for such a purpose.
* - that would duplicate a lot of code, since at the moment only
* kernel ufs logging implements log rolling.
* - the boot environment cannot be considered high-performance;
* rolling the log there would be slow.
* - boot device and root device could well be different, creating
* inconsistencies e.g. with a mirrored root if the log is rolled.
*
* Therefore, caching the log structural information (boot-relevant
* deltas and their logical log offset) is required for fast access
* to the data in the log. This code builds a logmap for that purpose.
*
* As a simple optimization, if we find the log is empty, we will not
* use it - log reader support for ufsboot has no noticeable overhead
* for clean logs, or for root filesystems that aren't logging.
*/
/*
 * Logmap hash geometry. LB_HASHFUNC() drops the low LB_HASHSHIFT bits
 * of a disk byte offset ("mof") and masks the result to one of
 * LB_HASHSIZE buckets.
 */
#define LB_HASHSHIFT 13
#define LB_HASHSIZE (1 << LB_HASHSHIFT)
#define LB_HASHFUNC(mof) (((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))
/*
 * Size limits, in bytes, of the buffer that backs the logmap:
 * LOGBUF_MAXSIZE is the upper bound of the resalloc'ed range,
 * LOGBUF_MINSIZE is the size of the static logbuffer_min[] array.
 */
#define LOGBUF_MAXSIZE (8*1024*1024)
#define LOGBUF_MINSIZE (256*1024)
/*
 * Result codes of lufs_logscan(), evaluated by lufs_boot_init().
 */
#define LOG_IS_EMPTY 0
#define LOG_IS_OK 1
#define LOG_IS_ERRORED 2
/*
* We build a hashed logmap of those while scanning the log.
* sizeof(lb_me_t) is 40 on 64bit, 32 on 32bit; the max sized
* resalloc'ed buffer can accommodate ~500k of those;
* this is approximately the maximum amount of deltas we'll
* see if a 64MB ufs log is completely filled. We'll make no
* attempt to free and reallocate the resalloc'ed buffer if
* we overflow, as conservative sizing should make that an
* impossibility. A future enhancement may allocate memory
* here as needed - once the boot time memory allocator
* supports that.
*/
/*
 * One logmap entry, describing a single delta found in the on-disk log.
 * Entries hashing to the same bucket are linked into a circular,
 * doubly-linked chain through l_next/l_prev (see inslist()/remlist()).
 */
typedef struct lb_mapentry {
struct lb_mapentry *l_next; /* hash chaining (also links the freelist) */
struct lb_mapentry *l_prev; /* hash chaining */
int64_t l_mof; /* disk addr this delta is against */
int16_t l_nb; /* size of delta */
int16_t l_flags; /* LB_ISCANCELLED or 0 */
int32_t l_lof; /* log offset of the delta payload (past its header) */
int32_t l_tid; /* transaction this delta is part of */
delta_t l_typ; /* see <sys/fs/ufs_trans.h> */
} lb_me_t;
/* l_flags: entry is to be dropped when the next DT_COMMIT is processed */
#define LB_ISCANCELLED 1
/*
 * inslist(lh, l): link entry "l" into the circular, doubly-linked hash
 * chain whose head pointer is "*lh".
 *
 * For a non-empty chain the entry is linked in directly before the
 * current head (i.e. as head->l_prev); the head pointer itself is not
 * moved. For an empty chain, "l" becomes the only member and the head.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, also inside an unbraced if/else. Both arguments
 * are evaluated more than once and must be free of side effects.
 */
#define	inslist(lh, l)	do {					\
	if ((*(lh))) {						\
		(*(lh))->l_prev->l_next = (l);			\
		(l)->l_next = (*(lh));				\
		(l)->l_prev = (*(lh))->l_prev;			\
		(*(lh))->l_prev = (l);				\
	} else {						\
		(l)->l_next = (l);				\
		(l)->l_prev = (l);				\
		(*(lh)) = (l);					\
	}							\
} while (0)
/*
 * remlist(lh, l): unlink entry "l" from the circular, doubly-linked
 * hash chain whose head pointer is "*lh".
 *
 * If "l" is the only member (it links to itself), the chain becomes
 * empty; a debug message is printed if the head or the back link
 * disagree with that. Otherwise the neighbours are joined, and if "l"
 * was the head, the head moves on to l->l_next. The links inside "l"
 * itself are left untouched.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, also inside an unbraced if/else. Both arguments
 * are evaluated more than once and must be free of side effects.
 */
#define	remlist(lh, l)	do {					\
	if ((l)->l_next == (l)) {				\
		if (*(lh) != (l) || (l)->l_prev != (l)) {	\
			dprintf("Logmap hash inconsistency.\n"); \
		}						\
		*(lh) = (lb_me_t *)NULL;			\
	} else {						\
		if (*(lh) == (l))				\
			*(lh) = (l)->l_next;			\
		(l)->l_prev->l_next = (l)->l_next;		\
		(l)->l_next->l_prev = (l)->l_prev;		\
	}							\
} while (0)
/*
 * Allocate one logmap entry from the log buffer (or its freelist).
 * Evaluates to NULL if lufs_alloc_from_logbuf() cannot satisfy the
 * request. The expansion is fully parenthesized so it can be used
 * inside larger expressions.
 */
#define	lufs_alloc_me()	\
	((lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)))
/* boot flags; RB_VERBOSE is tested before printing progress messages */
extern int boothowto;
/* set by lufs_boot_init() once the log was examined, cleared by lufs_closeall() */
static int ufs_is_lufs = 0;
/* private fileid_t used for all reads from the log */
static fileid_t *logfp = (fileid_t *)NULL;
/* in-core copy of the log's extent block */
static extent_block_t *eb = (extent_block_t *)NULL;
/* in-core copy of the on-disk log header */
static ml_odunit_t odi;
#ifndef i386
/* initial log buffer for non-i386 builds, see lufs_alloc_logbuf() */
static char logbuffer_min[LOGBUF_MINSIZE];
#endif
/* start, end and next free byte of the buffer backing the logmap */
static caddr_t logbuffer = (caddr_t)NULL;
static caddr_t elogbuffer = (caddr_t)NULL;
static caddr_t logbuf_curptr;
/* hash table of LB_HASHSIZE chain heads, allocated from the log buffer */
static lb_me_t **loghash = (lb_me_t **)NULL;
/* recycled lb_me_t entries, singly linked through l_next */
static lb_me_t *lfreelist;
/* transaction counter maintained while scanning the log */
static uint32_t curtid;
/* when zero, lufs_boot_init() returns without looking at the log */
int lufs_support = 1;
void lufs_boot_init(fileid_t *);
void lufs_closeall(void);
void lufs_merge_deltas(fileid_t *);
static int lufs_logscan(void);
extern int diskread(fileid_t *filep);
extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int);
/*
 * Base of the virtual address range used for the resalloc'ed log buffer.
 * NOTE(review): no definition is provided for builds that are neither
 * i386 nor __sparcv9, although the non-i386 code paths reference it.
 */
#if defined(i386)
#define LOGBUF_BASEADDR ((caddr_t)(KERNEL_TEXT - LOGBUF_MAXSIZE))
#elif defined(__sparcv9)
#define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
#endif
/*
 * Set up the buffer that backs the logmap and reset the allocator
 * state (logbuf_curptr, lfreelist).
 *
 * Returns 1 if a buffer is available, 0 if it could not be obtained.
 */
static int
lufs_alloc_logbuf(void)
{
	/*
	 * Allocate memory for caching the log. Since the logbuffer can
	 * potentially exceed the boot scratch memory limit, we use resalloc
	 * directly, passing the allocation to the low-level boot-time
	 * backend allocator. The chosen VA range is the top end of
	 * the kernel's segmap segment, so we're not interfering
	 * with the kernel because segmap is created at a time when
	 * the 2nd-stage boot has already been unloaded and this VA
	 * range was given back.
	 *
	 * On sparc platforms, the kernel cannot recover the memory
	 * obtained from resalloc because the page structs are allocated
	 * before the call to BOP_QUIESCE. To avoid leaking this
	 * memory, the logbuffer is allocated from a small bss array
	 * that should hold the logmap except in the most extreme cases.
	 * If the bss array is too small, the logbuffer is extended
	 * from resalloc 1 page at a time.
	 */
#ifdef i386
	logbuffer = resalloc(RES_CHILDVIRT, LOGBUF_MAXSIZE,
	    LOGBUF_BASEADDR, 0UL);
	/* Don't do pointer arithmetic on a failed allocation. */
	if (logbuffer == (caddr_t)NULL)
		elogbuffer = (caddr_t)NULL;
	else
		elogbuffer = logbuffer + LOGBUF_MAXSIZE;
#else
	logbuffer = logbuffer_min;
	elogbuffer = logbuffer + LOGBUF_MINSIZE;
#endif
	logbuf_curptr = logbuffer;
	lfreelist = (lb_me_t *)NULL;

	if (logbuffer == (caddr_t)NULL)
		return (0);

	/*
	 * The size is at most LOGBUF_MAXSIZE, so it always fits the
	 * unsigned int that the "%x" conversion expects.
	 */
	dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
	    logbuffer, (unsigned int)(elogbuffer - logbuffer));
	return (1);
}
static void
lufs_free_logbuf()
{
/*
* Solaris/x86 has no prom_free() routine at this time.
* Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
* is done by the kernel startup itself, in hat_unload_prom()
* after the bootloader has been quiesced.
*
* Solaris on sparc has a prom_free() routine that will update
* the memlist properties to reflect the freeing of the
* logbuffer. However, the sparc kernel cannot recover
* the memory freed after the call to BOP_QUIESCE as the
* page struct have already been allocated. We call
* prom_free anyway so that the kernel can reclaim this
* memory in the future.
*/
#ifndef i386
if (logbuffer == LOGBUF_BASEADDR)
prom_free(logbuffer, elogbuffer-logbuffer);
#endif
logbuffer = (caddr_t)NULL;
}
static caddr_t
lufs_alloc_from_logbuf(size_t sz)
{
caddr_t tmpaddr;
lb_me_t *l;
/*
* Satisfy lb_me_t allocations from the freelist
* first if possible.
*/
if ((sz == sizeof (lb_me_t)) && lfreelist) {
l = lfreelist;
lfreelist = lfreelist->l_next;
return ((caddr_t)l);
}
if (elogbuffer < logbuf_curptr + sz) {
#ifdef i386
return ((caddr_t)NULL);
#else
caddr_t np;
size_t nsz;
/*
* Out of space in current chunk - try to add another.
*/
if (logbuffer == logbuffer_min) {
np = LOGBUF_BASEADDR;
} else {
np = elogbuffer;
}
nsz = roundup(sz, PAGESIZE);
if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
return ((caddr_t)NULL);
}
np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
if (np == (caddr_t)NULL) {
return ((caddr_t)NULL);
}
if (logbuffer == logbuffer_min)
logbuffer = LOGBUF_BASEADDR;
logbuf_curptr = np;
elogbuffer = logbuf_curptr + nsz;
#endif
}
tmpaddr = logbuf_curptr;
logbuf_curptr += sz;
bzero(tmpaddr, sz);
return (tmpaddr);
}
/*
 * Read "nb" bytes of delta data from the log, starting at logical log
 * offset "addr", into "va". Reads are split at sector boundaries, the
 * sector trailers are skipped, and the offset wraps from od_eol_lof to
 * od_bol_lof.
 *
 * If "va" is NULL the data is only skipped. Once the log is in use
 * (ufs_is_lufs set) such a skip does no disk I/O at all; during the
 * initial scan the sectors are still read so their idents get verified.
 *
 * Returns the logical log offset following the data, or 0 if no extent
 * maps the offset, the disk read failed, or a sector ident was out of
 * sequence.
 */
static int32_t
lufs_read_log(int32_t addr, caddr_t va, int nb)
{
	int i, fastpath = 0;
	daddr_t pblk, lblk;
	sect_trailer_t *st;
	uint32_t ident;

	/*
	 * Fast path for skipping the read if no target buffer
	 * is specified. Don't do this for the initial scan.
	 */
	if (ufs_is_lufs && (va == (caddr_t)NULL))
		fastpath = 1;

	/* A non-positive byte count means there is nothing to transfer. */
	while (nb > 0) {
		/* log wraparound check */
		if (addr == odi.od_eol_lof)
			addr = odi.od_bol_lof;
		if (fastpath)
			goto read_done;

		/*
		 * Translate logically-contiguous log offsets into physical
		 * block numbers. For a log consisting of a single extent:
		 *	pbno = btodb(addr) - extents[0].lbno;
		 * Otherwise, search for the extent which contains addr.
		 */
		pblk = 0;
		lblk = btodb(addr);
		for (i = 0; i < eb->nextents; i++) {
			if (lblk >= eb->extents[i].lbno &&
			    lblk < eb->extents[i].lbno +
			    eb->extents[i].nbno) {
				pblk = lblk - eb->extents[i].lbno +
				    eb->extents[i].pbno;
				break;
			}
		}

		if (pblk == 0) {
			/*
			 * block #0 can never be in a log extent since this
			 * block always contains the primary superblock copy.
			 */
			dprintf("No log extent found for log offset 0x%llx.\n",
			    (unsigned long long)addr);
			return (0);
		}

		/*
		 * Check whether the block we want is cached from the last
		 * read. If not, read it in now.
		 */
		if (logfp->fi_blocknum != pblk) {
			logfp->fi_blocknum = pblk;
			logfp->fi_memp = logfp->fi_buf;
			logfp->fi_count = DEV_BSIZE;
			logfp->fi_offset = 0;
			if (diskread(logfp)) {
				dprintf("I/O error reading the ufs log" \
				    " at block 0x%x.\n",
				    (unsigned int)pblk);
				/*
				 * The buffer contents are not valid. Forget
				 * the block number so a later request for the
				 * same block is not served from the "cache".
				 * Zero never matches, see above.
				 */
				logfp->fi_blocknum = 0;
				return (0);
			}

			/*
			 * Log structure verification. The block which we just
			 * read has an ident number that must match its offset
			 * in blocks from the head of the log. Since the log
			 * can wrap around, we have to check for that to get the
			 * ident right. Out-of-sequence idents can happen after
			 * power failures, panics during a partial transaction,
			 * media errors, ... - in any case, they mark the end of
			 * the valid part of the log.
			 */
			st = (sect_trailer_t *)(logfp->fi_memp +
			    LDL_USABLE_BSIZE);
			/* od_head_ident is where the sequence starts */
			ident = odi.od_head_ident;
			if (lblk >= lbtodb(odi.od_head_lof)) {
				/* no wraparound */
				ident += (lblk - lbtodb(odi.od_head_lof));
			} else {
				/* log wrapped around the end */
				ident += (lbtodb(odi.od_eol_lof) -
				    lbtodb(odi.od_head_lof));
				ident += (lblk - lbtodb(odi.od_bol_lof));
			}

			if (ident != st->st_ident) {
				/*
				 * Not part of the valid log. Don't keep it
				 * cached, otherwise the next request for this
				 * block would bypass the ident check.
				 */
				logfp->fi_blocknum = 0;
				return (0);
			}
		}
read_done:
		/*
		 * Copy the delta contents to the destination buffer if
		 * one was specified. Otherwise, just skip the contents.
		 */
		i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
		if (va != NULL) {
			bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
			    va, i);
			va += i;
		}
		nb -= i;
		addr += i;

		/*
		 * Skip sector trailer if necessary.
		 */
		if (NB_LEFT_IN_SECTOR(addr) == 0)
			addr += sizeof (sect_trailer_t);
	}
	return (addr);
}
/*
 * Examine the ufs log of the boot filesystem and, if it holds committed
 * data, build the logmap so that lufs_merge_deltas() can apply it.
 *
 * "filep" must describe the superblock (fi_memp pointing at the
 * superblock copy kept in fi_devp). Nothing is done if lufs support is
 * disabled, if "filep" is not the superblock, or if the filesystem is
 * not marked as logging. If the log is empty or cannot be used, all
 * resources are released again through lufs_closeall().
 */
void
lufs_boot_init(fileid_t *filep)
{
	struct fs *sb = (struct fs *)filep->fi_memp;
	int err = 0;

	/*
	 * boot_ufs_mountroot() should have called us with a
	 * filep pointing to the superblock. Verify that this
	 * is so first.
	 * Then check whether this filesystem has a dirty log.
	 * Also return if lufs support was disabled on request.
	 * Note that fs_logbno is a block number, not a pointer.
	 */
	if (!lufs_support ||
	    sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
	    sb->fs_clean != FSLOG || sb->fs_logbno == 0) {
		return;
	}

	if (boothowto & RB_VERBOSE)
		printf("The boot filesystem is logging.\n");

	/*
	 * The filesystem is logging, there is a log area
	 * allocated for it. Check the log state and determine
	 * whether it'll be possible to use this log.
	 */

	/*
	 * Allocate a private fileid_t for use when reading
	 * from the log.
	 */
	eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
	logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
	if (eb == (extent_block_t *)NULL || logfp == (fileid_t *)NULL) {
		/*
		 * Defensive: don't dereference a failed allocation.
		 * ufs_is_lufs is still clear, so lufs_closeall() would
		 * not clean up for us - release whatever we got here
		 * and boot without log support.
		 */
		if (eb != (extent_block_t *)NULL)
			bkmem_free((char *)eb, sb->fs_bsize);
		if (logfp != (fileid_t *)NULL)
			bkmem_free((char *)logfp, sizeof (fileid_t));
		eb = (extent_block_t *)NULL;
		logfp = (fileid_t *)NULL;
		dprintf("Failed to allocate memory for reading the log.\n");
		return;
	}
	logfp->fi_memp = logfp->fi_buf;
	logfp->fi_devp = filep->fi_devp;

	/*
	 * Read the extent block and verify that what we
	 * find there are actually lufs extents.
	 * Make it simple: the extent block including all
	 * extents cannot be larger than a filesystem block.
	 * So read a whole filesystem block, to make sure
	 * we have read all extents in the same operation.
	 * An extent block without extents is unusable, since
	 * the log header lives at the start of extents[0].
	 */
	logfp->fi_blocknum = sb->fs_logbno;
	logfp->fi_count = sb->fs_bsize;
	logfp->fi_memp = (caddr_t)eb;
	logfp->fi_offset = 0;
	if (diskread(logfp) || eb->type != LUFS_EXTENTS ||
	    eb->nextents == 0) {
		dprintf("Failed to read log extent block.\n");
		err = LOG_IS_ERRORED;
		goto out;
	}

	/*
	 * Read the on disk log header. If that fails,
	 * try the backup copy on the adjacent block.
	 */
	logfp->fi_blocknum = eb->extents[0].pbno;
	logfp->fi_count = sizeof (ml_odunit_t);
	logfp->fi_memp = (caddr_t)&odi;
	logfp->fi_offset = 0;
	if (diskread(logfp)) {
		logfp->fi_blocknum = eb->extents[0].pbno + 1;
		logfp->fi_count = sizeof (ml_odunit_t);
		logfp->fi_memp = (caddr_t)&odi;
		logfp->fi_offset = 0;
		if (diskread(logfp)) {
			dprintf("Failed to read on-disk log header.\n");
			err = LOG_IS_ERRORED;
			goto out;
		}
	}

	/*
	 * Verify that we understand this log, and
	 * that the log isn't bad or empty.
	 */
	if (odi.od_version != LUFS_VERSION_LATEST) {
		dprintf("On-disk log format v%d != supported format v%d.\n",
		    odi.od_version, LUFS_VERSION_LATEST);
		err = LOG_IS_ERRORED;
	} else if (odi.od_badlog) {
		dprintf("On-disk log is marked bad.\n");
		err = LOG_IS_ERRORED;
	} else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
		dprintf("On-disk log checksum %d != ident sum %d.\n",
		    odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
		err = LOG_IS_ERRORED;
	} else {
		/*
		 * All consistency checks ok. Scan the log, build the
		 * log hash. If this succeeds we'll be using the log
		 * when reading from this filesystem.
		 */
		err = lufs_logscan();
	}

out:
	/*
	 * Must be set before lufs_closeall() is called below, which
	 * does nothing unless this flag is set.
	 */
	ufs_is_lufs = 1;
	switch (err) {
	case LOG_IS_EMPTY:
		if (boothowto & RB_VERBOSE)
			printf("The ufs log is empty and will not be used.\n");
		lufs_closeall();
		break;
	case LOG_IS_OK:
		if (boothowto & RB_VERBOSE)
			printf("Using the ufs log.\n");
		break;
	case LOG_IS_ERRORED:
		if (boothowto & RB_VERBOSE)
			printf("Couldn't build log hash. Can't use ufs log.\n");
		lufs_closeall();
		break;
	default:
		dprintf("Invalid error %d while scanning the ufs log.\n", err);
		break;
	}
}
/*
 * Fetch the delta header found at log offset "*addr" into "d" and
 * advance "*addr" past it.
 *
 * Returns 1 if a plausible header was read. Returns 0 if the log could
 * not be read (in which case "*addr" is 0), if the delta type is out of
 * range, or if the delta size is not smaller than the log itself.
 */
static int
lufs_logscan_read(int32_t *addr, struct delta *d)
{
	int32_t next;

	next = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
	*addr = next;

	/* Only look at the header if it was actually read. */
	if (next == 0)
		return (0);
	if (d->d_typ < DT_NONE || d->d_typ > DT_MAX)
		return (0);
	if (d->d_nb >= odi.od_logsize)
		return (0);
	return (1);
}
/*
 * Advance "*addr" from the end of delta header "d" to the start of the
 * next delta header, skipping the payload of "d" (if it has any).
 *
 * Returns 1 on success, 0 if the payload could not be read from the
 * log; "*addr" is 0 in that case.
 */
static int
lufs_logscan_skip(int32_t *addr, struct delta *d)
{
	switch (d->d_typ) {
	case DT_COMMIT:
		/*
		 * A DT_COMMIT delta has no size as such, but will
		 * always "fill up" the sector that contains it.
		 * The next delta header is found at the beginning
		 * of the next 512-Bytes sector, adjust "addr" to
		 * reflect that. Nothing to do if we already are
		 * at a sector boundary.
		 */
		if (*addr & (DEV_BSIZE - 1)) {
			*addr += NB_LEFT_IN_SECTOR(*addr) +
			    sizeof (sect_trailer_t);
		}
		return (1);
	case DT_CANCEL:
	case DT_ABZERO:
		/*
		 * These types of deltas occupy no space in the log
		 */
		return (1);
	default:
		/*
		 * Skip over the delta contents.
		 */
		*addr = lufs_read_log(*addr, NULL, d->d_nb);
		break;
	}
	/* "*addr" is a log offset, not a pointer: zero signals failure. */
	return (*addr != 0);
}
/*
 * Remove every entry flagged LB_ISCANCELLED from all hash chains.
 * Removed entries are zeroed and pushed onto lfreelist for reuse.
 */
static void
lufs_logscan_freecancel(void)
{
lb_me_t **lh, *l, *lnext;
int i;
/*
* Walk the entire log hash and put cancelled entries
* onto the freelist. Corner cases:
* a) empty hash chain (*lh == NULL)
* b) only one entry in chain, and that is cancelled.
* If for every cancelled delta another one would've
* been added, this situation couldn't occur, but a
* DT_CANCEL delta can lead to this as it is never
* added.
*/
for (i = 0; i < LB_HASHSIZE; i++) {
lh = &loghash[i];
l = *lh;
do {
/* chain is empty, or became empty by the removal below */
if (*lh == (lb_me_t *)NULL)
break;
/* remember the successor, "l" may be zeroed below */
lnext = l->l_next;
if (l->l_flags & LB_ISCANCELLED) {
remlist(lh, l);
bzero((caddr_t)l, sizeof (lb_me_t));
l->l_next = lfreelist;
lfreelist = l;
/*
* The successor is now the chain head - either because
* the head itself was removed, or because the removed
* entry was the last one before the head. Continuing
* would terminate the while loop right away, so respin
* the chain walk for this hash chain instead.
*/
if (lnext == *lh) {
i--;
break;
}
}
l = lnext;
} while (l != *lh);
}
}
/*
 * Account for delta header "d" in the logmap. "*addr" is the log offset
 * following the header; it is recorded as the entry's l_lof but not
 * modified here.
 *
 * Returns 1 on success (including delta types that are ignored), 0 if
 * no memory for a new map entry is available; curtid is reset to 0 in
 * that case.
 */
static int
lufs_logscan_addmap(int32_t *addr, struct delta *d)
{
lb_me_t **lh, *l;
switch (d->d_typ) {
case DT_COMMIT:
/*
* Handling DT_COMMIT deltas is special. We need to:
* 1. increase the transaction ID
* 2. remove cancelled entries.
*/
lufs_logscan_freecancel();
curtid++;
break;
case DT_INODE:
/*
* Deltas against parts of on-disk inodes are
* assumed to be timestamps. Ignore those.
*/
if (d->d_nb != sizeof (struct dinode))
break;
/* FALLTHROUGH */
case DT_CANCEL:
case DT_ABZERO:
case DT_AB:
case DT_DIR:
case DT_FBI:
/*
* These types of deltas contain and/or modify structural
* information that is needed for booting the system:
* - where to find a file (DT_DIR, DT_FBI)
* - the file itself (DT_INODE)
* - data blocks associated with a file (DT_AB, DT_ABZERO)
*
* Building the hash chains becomes complicated because there
* may exist an older (== previously added) entry that overlaps
* with the one we want to add.
* Four cases must be distinguished:
* 1. The new delta is an exact match for an existing one,
* or is a superset of an existing one, and both
* belong to the same transaction.
* The new delta completely supersedes the old one, so
* remove that and reuse the structure for the new.
* Then add the new delta to the head of the hashchain.
* 2. The new delta is an exact match for an existing one,
* or is a superset of an existing one, but the two
* belong to different transactions (i.e. the old one is
* committed).
* The existing one is marked to be cancelled when the
* next DT_COMMIT record is found, and the hash chain
* walk is continued as there may be more existing entries
* found which overlap the new delta (happens if that is
* a superset of those in the log).
* Once no more overlaps are found, goto 4.
* 3. An existing entry completely covers the new one.
* The new delta is then added directly before this
* existing one.
* 4. No (more) overlaps with existing entries are found.
* Unless this is a DT_CANCEL delta, whose only purpose
* is already handled by marking overlapping entries for
* cancellation, add the new delta at the hash chain head.
*
* This strategy makes sure that the hash chains are properly
* ordered. lufs_merge_deltas() walks the hash chain backward,
* which then ensures that delta merging is done in the same
* order as those deltas occur in the log - remember, the
* log can only be read in one direction.
*
*/
lh = &loghash[LB_HASHFUNC(d->d_mof)];
l = *lh;
do {
/* empty chain: nothing to compare against, go to case 4 */
if (l == (lb_me_t *)NULL)
break;
/*
* This covers the first two cases above.
* If this is a perfect match from the same transaction,
* and it isn't already cancelled, we simply replace it
* with its newer incarnation.
* Otherwise, mark it for cancellation. Handling of
* DT_COMMIT is going to remove it, then.
*/
if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
if (!(l->l_flags & LB_ISCANCELLED)) {
if (l->l_tid == curtid &&
d->d_typ != DT_CANCEL) {
/* 1st case - unlink, refill and relink the entry */
remlist(lh, l);
l->l_mof = d->d_mof;
l->l_lof = *addr;
l->l_nb = d->d_nb;
l->l_typ = d->d_typ;
l->l_flags = 0;
l->l_tid = curtid;
inslist(lh, l);
return (1);
} else {
/*
* 2nd case - cancel only.
*/
l->l_flags |= LB_ISCANCELLED;
}
}
} else if (WITHIN(d->d_mof, d->d_nb,
l->l_mof, l->l_nb)) {
/*
* This is the third case above.
* With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
* this may happen - an existing previous delta
* is larger than the current one we're planning
* to add - DT_ABZERO deltas are supersets of
* DT_AB deltas, and likewise DT_FBI/DT_DIR.
* In order to do merging correctly, such deltas
* put up a barrier for new ones that overlap,
* and we have to add the new delta immediately
* before (!) the existing one.
*/
lb_me_t *newl;
newl = lufs_alloc_me();
if (newl == (lb_me_t *)NULL) {
/*
* No memory. Throw away everything
* and try booting without logging
* support.
*/
curtid = 0;
return (0);
}
newl->l_mof = d->d_mof;
newl->l_lof = *addr; /* "payload" address */
newl->l_nb = d->d_nb;
newl->l_typ = d->d_typ;
newl->l_tid = curtid;
/* link newl in between l->l_prev and l */
newl->l_prev = l->l_prev;
newl->l_next = l;
l->l_prev->l_next = newl;
l->l_prev = newl;
/* if l was the chain head, newl takes over that role */
if (*lh == l)
*lh = newl;
return (1);
}
l = l->l_next;
} while (l != *lh);
/*
* This is case 4., add a new delta at the head of the chain.
*
* If the new delta is a DT_CANCEL entry, we handled it by
* marking everything it covered for cancellation. We can
* get by without actually adding the delta itself to the
* hash, as it'd need to be removed by the commit code anyway.
*/
if (d->d_typ == DT_CANCEL)
break;
l = lufs_alloc_me();
if (l == (lb_me_t *)NULL) {
/*
* No memory. Throw away everything
* and try booting without logging
* support.
*/
curtid = 0;
return (0);
}
l->l_mof = d->d_mof;
l->l_lof = *addr; /* this is the "payload" address */
l->l_nb = d->d_nb;
l->l_typ = d->d_typ;
l->l_tid = curtid;
inslist(lh, l);
break;
default:
/* all other delta types are not tracked in the logmap */
break;
}
return (1);
}
/*
 * Prepare for scanning the log: fake a completely filled log, reset
 * the transaction counter, and set up the log buffer together with
 * the (initially empty) hash table.
 *
 * Returns 1 on success, 0 if the log buffer or the hash table could
 * not be allocated.
 */
static int
lufs_logscan_prescan(void)
{
	/*
	 * Place the tail one sector behind the head, which is what a
	 * full log looks like. The scan then runs through the whole
	 * log until it meets an out-of-sequence sector ident.
	 */
	odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
	if (odi.od_tail_lof < odi.od_bol_lof)
		odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
	if (odi.od_tail_lof >= odi.od_eol_lof)
		odi.od_tail_lof = odi.od_bol_lof;

	/*
	 * Sector trailers carry TID values, but the kernel ufs logging
	 * support does not keep od_head_tid up to date at this time.
	 * Transactions are therefore counted here, starting at zero,
	 * just like the kernel ufs logscan code does.
	 */
	curtid = 0;

	if (lufs_alloc_logbuf()) {
		loghash = (lb_me_t **)lufs_alloc_from_logbuf(
		    LB_HASHSIZE * sizeof (lb_me_t *));
		if (loghash != (lb_me_t **)NULL)
			return (1);
		{
			dprintf("Can't allocate loghash[] array.");
		}
		return (0);
	}
	{
		dprintf("Failed to allocate log buffer.\n");
	}
	return (0);
}
/*
* This function must remove all uncommitted entries (l->l_tid == curtid)
* from the log hash. Doing this, we implicitly delete pending cancellations
* as well: entries that are kept get their LB_ISCANCELLED flag cleared.
* Removed entries are zeroed and pushed onto lfreelist.
* It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
* the check for entries that need to be removed is different.
*/
static void
lufs_logscan_postscan(void)
{
lb_me_t **lh, *l, *lnext;
int i;
for (i = 0; i < LB_HASHSIZE; i++) {
lh = &loghash[i];
l = *lh;
do {
/* empty hash chain */
if (l == (lb_me_t *)NULL)
break;
/* remember the successor, "l" may be zeroed below */
lnext = l->l_next;
if (l->l_tid == curtid) {
remlist(lh, l);
bzero((caddr_t)l, sizeof (lb_me_t));
l->l_next = lfreelist;
lfreelist = l;
/* that was the last entry of this chain */
if (*lh == (lb_me_t *)NULL)
break;
/*
* The successor is now the chain head - either because
* the head itself was removed, or because the removed
* entry was the last one before the head. Continuing
* would terminate the while loop right away, so respin
* the chain walk for this hash chain instead.
*/
if (lnext == *lh) {
i--;
break;
}
} else {
/* committed entry: keep it, drop a pending cancellation */
l->l_flags &= ~(LB_ISCANCELLED);
}
l = lnext;
} while (l != *lh);
}
}
/*
 * Build the log hash by scanning the on-disk log. The sequence of
 * actions is the same as in the kernel ufs logging support:
 *	- Prepare the log for scanning by simulating a full log.
 *	- As long as sectors read from the log have contiguous idents:
 *		read the delta header,
 *		add the delta to the logmap,
 *		skip over the contents to the start of the next header.
 *	- After terminating the scan, remove uncommitted entries.
 *
 * A structural integrity problem, or a part of the on-disk log that
 * cannot be read, is treated the same way as an uncommitted
 * transaction at the end of the log (or, as a corner case of that, an
 * empty log with no committed transactions in it at all).
 *
 * Returns:
 *	LOG_IS_ERRORED	if lufs_logscan_prescan() fails, or if adding a
 *			delta to the logmap fails
 *	LOG_IS_EMPTY	if nothing was scanned or no transaction was
 *			committed
 *	LOG_IS_OK	otherwise
 */
static int
lufs_logscan(void)
{
	struct delta d;
	int32_t addr;

	if (!lufs_logscan_prescan())
		return (LOG_IS_ERRORED);

	/*
	 * Reaching od_tail_lof means the log is completely filled.
	 * That almost never happens, so the loop is normally left
	 * through one of the 'break's.
	 */
	for (addr = odi.od_head_lof; addr != odi.od_tail_lof; ) {
		if (!lufs_logscan_read(&addr, &d))
			break;
		if (!lufs_logscan_addmap(&addr, &d))
			return (LOG_IS_ERRORED);
		if (!lufs_logscan_skip(&addr, &d))
			break;
	}

	lufs_logscan_postscan();

	/*
	 * The log is only of use if the scan got anywhere and found
	 * at least one committed transaction.
	 */
	if (curtid == 0 || addr == odi.od_head_lof)
		return (LOG_IS_EMPTY);
	return (LOG_IS_OK);
}
/*
 * A metadata block was read from disk. Check whether the logmap
 * has a delta against this byte range, and if so read it in, since
 * the data in the log is more recent than what was read from other
 * places on the disk.
 *
 * "fp" describes the block that was just read: fi_blocknum and
 * fi_count give the byte range on disk, fi_memp the buffer that is
 * patched in place. Nothing is done if the log is not in use.
 */
void
lufs_merge_deltas(fileid_t *fp)
{
	int nb;
	int64_t bof;
	lb_me_t **lh, *l;
	int32_t skip;

	/*
	 * No logmap: Empty log. Nothing to do here.
	 */
	if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
		return;

	bof = ldbtob(fp->fi_blocknum);
	nb = fp->fi_count;

	/*
	 * Search the log hash.
	 * Merge deltas if an overlap is found.
	 */
	lh = &loghash[LB_HASHFUNC(bof)];
	if (*lh == (lb_me_t *)NULL)
		return;

	/*
	 * Walk the chain backward, starting with the entry before the
	 * head and finishing with the head itself.
	 */
	l = *lh;
	do {
		l = l->l_prev;
		if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
			/*
			 * Found a delta in the log hash which overlaps
			 * with the current metadata block. Read the
			 * actual delta payload from the on-disk log
			 * directly into the file buffer.
			 */
			if (l->l_typ != DT_ABZERO) {
				/*
				 * We have to actually read this part of the
				 * log as it could contain a sector trailer, or
				 * wrap around the end of the log.
				 * If it did, the second offset generation would
				 * be incorrect if we'd started at l->l_lof.
				 */
				skip = lufs_read_log(l->l_lof, NULL,
				    MAX(bof - l->l_mof, 0));
				if (skip == 0) {
					/*
					 * Without a valid start offset there
					 * is nothing sensible to merge; don't
					 * read from log offset 0.
					 */
					dprintf("scan/merge error, pre-skip\n");
				} else {
					skip = lufs_read_log(skip,
					    fp->fi_memp +
					    MAX(l->l_mof - bof, 0),
					    MIN(l->l_mof + l->l_nb, bof + nb) -
					    MAX(l->l_mof, bof));
					if (skip == 0) {
						dprintf("scan/merge error, "
						    "merge\n");
					}
				}
			} else {
				/*
				 * DT_ABZERO requires no disk access, just
				 * clear the byte range which overlaps with
				 * the delta.
				 */
				bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
				    MIN(l->l_mof + l->l_nb, bof + nb) -
				    MAX(l->l_mof, bof));
			}
		}
	} while (l->l_prev != (*lh)->l_prev);

	printf("*\b");
}
/*
 * Stop using the log: release the extent block, the private fileid_t
 * and the log buffer, and wipe the in-core copy of the log header.
 * Does nothing unless ufs_is_lufs is set.
 */
void
lufs_closeall(void)
{
	if (!ufs_is_lufs)
		return;

	/* The size of the extent block is taken from logfp - free eb first. */
	bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
	bkmem_free((char *)logfp, sizeof (fileid_t));
	logfp = (fileid_t *)NULL;
	eb = (extent_block_t *)NULL;

	bzero((caddr_t)&odi, sizeof (ml_odunit_t));
	lufs_free_logbuf();
	ufs_is_lufs = 0;
}