mail-transaction-log-file.c revision da4376093d4e1b26b14ea1e945689fb7056fe0a0
/* Copyright (c) 2003-2014 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "array.h"
#include "ioloop.h"
#include "file-dotlock.h"
#include "nfs-workarounds.h"
#include "read-full.h"
#include "write-full.h"
#include "mmap-util.h"
#include "mail-index-private.h"
#include "mail-index-modseq.h"
#include "mail-transaction-log-private.h"
#define LOG_PREFETCH IO_BLOCK_SIZE
#define MEMORY_LOG_NAME "(in-memory transaction log file)"
#define LOG_NEW_DOTLOCK_SUFFIX ".newlock"
static int
static void
const char *function)
{
}
static void
{
unsigned int offset =
int flags;
return;
/* indexid=0 marks the log file as corrupted. we opened the file with
O_APPEND, and now we need to drop it for pwrite() to work (at least
in Linux) */
if (flags < 0) {
return;
}
return;
}
}
}
void
const char *fmt, ...)
{
T_BEGIN {
"Corrupted transaction log file %s seq %u: %s "
} T_END;
}
struct mail_transaction_log_file *
const char *path)
{
struct mail_transaction_log_file *file;
return file;
}
{
struct mail_transaction_log_file **p;
if (*p == file) {
break;
}
}
}
}
}
static void
{
const struct mail_index_modseq_header *modseq_hdr;
return;
/* we can get a valid log offset from index file. initialize
sync_offset from it so we don't have to read the whole log
file from beginning. */
"%s: log_file_head_offset too small",
/* modseqs not used yet */
file->sync_highest_modseq = 0;
} else if (modseq_hdr == NULL ||
/* highest_modseq not synced, start from beginning */
"%s: modseq_hdr.log_offset too large",
} else {
/* start from where we last stopped tracking modseqs */
}
}
}
static void
{
struct mail_transaction_log_file **p;
/* insert it to correct position */
break;
}
*p = file;
/* if we read any unfinished data, make sure the buffer gets
truncated. */
(void)mail_transaction_log_file_sync(file);
}
}
static int
struct mail_transaction_log_header *hdr)
{
struct mail_transaction_log_file *file;
#if !WORDS_BIGENDIAN
#endif
/* not creating index - make sure we have latest header */
if (mail_index_map(index,
MAIL_INDEX_SYNC_HANDLER_HEAD) <= 0)
return -1;
} else {
/* if we got here from mapping, the .log file is
corrupted. use whatever values we got from index
file */
}
}
} else {
}
/* make sure the sequence always increases to avoid crashes
later. this catches the buggy case where two processes
happen to replace the same log file. */
}
/* make sure the sequence grows */
}
/* this should be always up-to-date */
}
}
return 0;
}
struct mail_transaction_log_file *
{
struct mail_transaction_log_file *file;
return NULL;
}
return file;
}
static int
{
struct dotlock_settings dotlock_set;
int ret;
ret = 1;
else {
}
if (ret > 0) {
return 0;
}
if (ret < 0) {
return -1;
}
"Timeout (%us) while waiting for "
"dotlock for transaction log file %s",
return -1;
}
static int
{
int ret;
return 0;
if (ret < 0) {
return -1;
}
if (ret == 0) {
"Dotlock was lost for transaction log file %s",
return -1;
}
return 0;
}
{
unsigned int lock_timeout_secs;
int ret;
return 0;
return 0;
}
return mail_transaction_log_file_dotlock(file);
"Index is read-only, can't write-lock %s",
return -1;
}
if (ret > 0) {
return 0;
}
if (ret < 0) {
return -1;
}
"Timeout (%us) while waiting for lock for "
"transaction log file %s%s",
return -1;
}
{
unsigned int lock_time;
return;
return;
if (lock_time >= MAIL_TRANSACTION_LOG_LOCK_TIMEOUT) {
i_warning("Transaction log file %s was locked for %u seconds",
}
return;
}
}
static ssize_t
{
void *dest;
/* just read the entire transaction log to memory.
note that if some of the data hasn't been fully committed
yet (hdr.size=0), the buffer must be truncated later */
file->buffer_offset = 0;
} else {
/* read only the header */
}
/* it's not necessarily an error to read less than wanted header size,
since older versions of the log format used smaller headers. */
pos = 0;
do {
if (ret > 0)
}
}
static int
{
int ret;
/* mark the old file corrupted. we can't safely remove
it from the list however, so return failure. */
/* only mark .2 corrupted, just to make sure we don't lose any
changes from .log in case we're somehow wrong */
ret = 0;
} else {
ret = -1;
}
"Transaction log %s: "
"duplicate transaction log sequence (%u)",
}
return ret;
}
static int
bool ignore_estale)
{
struct mail_transaction_log_file *f;
int ret;
return 0;
if (ret < 0) {
return -1;
}
/* incompatible version - fix silently */
return 0;
}
if (ret < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) {
"unexpected end of file while reading header");
return 0;
}
/* we have compatibility flags */
enum mail_index_header_compat_flags compat_flags = 0;
#if !WORDS_BIGENDIAN
#endif
/* architecture change */
"Rebuilding index file %s: "
"CPU architecture changed",
return 0;
}
}
"Header size too small");
return 0;
}
/* @UNSAFE: smaller than we expected - zero out the fields we
shouldn't have filled */
}
/* corrupted */
"Transaction log file %s: marked corrupted",
return 0;
}
/* index file was probably just rebuilt and we don't
know about it yet */
"indexid changed %u -> %u",
return 0;
}
/* creating index file. since transaction log is created
first, use the indexid in it to create the main index
to avoid races. */
}
/* make sure we don't already have a file with the same sequence
opened. it shouldn't happen unless the old log file was
corrupted. */
/* old "f" is the .log.2 */
return mail_transaction_log_file_fail_dupe(f);
} else {
/* new "file" is probably the .log.2 */
return mail_transaction_log_file_fail_dupe(file);
}
}
}
return 1;
}
static int
bool ignore_estale)
{
return -1;
}
return 0;
}
static bool
{
struct mail_transaction_log_file *tmp;
return TRUE;
}
return FALSE;
}
{
const struct mail_index_registered_ext *rext;
struct mail_transaction_header *hdr;
struct mail_transaction_ext_intro *intro;
struct mail_transaction_ext_hdr_update *ext_hdr;
unsigned int hdr_offset;
/* introduce the extension */
/* add the extension header data */
}
static int
{
const char *path2;
bool rename_existing;
return -1;
}
/* although we check also mtime and file size below, it's done
only to fix broken log files. we don't bother flushing
attribute cache just for that. */
}
/* log creation is locked now - see if someone already created it.
note that if we're rotating, we need to keep the log locked until
the file has been rewritten. and because fcntl() locks are stupid,
if we go and open()+close() the file and we had it already opened,
its locks are lost. so we use stat() to check if the file has been
recreated, although it almost never is. */
if (reset)
return -1;
}
/* ... but not when we're replacing a broken log file */
/* no-one else recreated the file */
} else {
/* recreated. use the file if its header is ok */
if (fd == -1) {
return -1;
}
} else {
FALSE) > 0 &&
/* yes, it was ok */
return 0;
}
}
}
/* creating the initial index */
}
return -1;
if (reset) {
/* don't reset modseqs. if we're resetting due to rebuilding
indexes we'll probably want to keep uidvalidity and in such
cases we really don't want to shrink modseqs. */
}
return -1;
}
/* the header isn't important, so don't bother calling
fdatasync() unless it's required */
return -1;
}
}
/* we'll need to preserve the lock */
if (mail_transaction_log_file_lock(file) < 0)
ret = -1;
}
/* if we return -1 the dotlock deletion code closes the fd */
if (ret < 0)
return -1;
/* keep two log files */
if (rename_existing) {
/* rename() would be a nice and easy way to do this, except then
there's a race condition between the rename and
file_dotlock_replace(). during that time the log file
doesn't exist, which could cause problems. */
path2);
/* try to link() anyway */
}
/* ignore the error. we don't care that much about the
second log file and we're going to overwrite this
first one. */
}
/* NOTE: here's a race condition where both .log and .log.2
point to the same file. our reading code should ignore that
though by comparing the inodes. */
}
return -1;
/* success */
return 0;
}
bool reset)
{
struct dotlock_settings new_dotlock_set;
int fd;
"Can't create log file %s: Index is read-only",
return -1;
}
/* With dotlocking we might already have path.lock created, so this
filename has to be different. */
if (fd == -1) {
return -1;
}
/* either fd gets used or the dotlock gets deleted and returned fd
is for the existing file */
return -1;
}
return 0;
}
{
unsigned int i;
bool ignore_estale;
int ret;
for (i = 0;; i++) {
} else {
}
}
return 0;
return -1;
}
ret = -1;
else if (mail_transaction_log_file_is_dupe(file)) {
/* probably our already opened .log file has been
renamed to .log.2 and we're trying to reopen it.
it's also possible that we hit a race condition where .log
and .log.2 are linked. */
return 0;
} else {
}
if (ret > 0) {
/* success */
break;
}
if (ret == 0) {
/* corrupted */
/* don't delete */
"unlink(%s) failed: %m",
}
return 0;
}
i == MAIL_INDEX_ESTALE_RETRY_COUNT) {
/* syscall error */
return -1;
}
/* ESTALE - try again */
}
return 1;
}
static int
{
const struct mail_transaction_header_update *u = data;
const struct mail_index_header *ihdr;
const unsigned int offset_pos =
"header update extends beyond record size");
return -1;
}
if (u->offset <= offset_pos &&
sizeof(tail_offset));
/* ignore shrinking tail offsets */
return 1;
} else {
return 1;
}
}
return 0;
}
{
i_assert(trans_size != 0);
if (*cur_modseq != 0) {
/* tracking modseqs */
/* modseqs not tracked yet. see if this is a modseq
extension introduction. */
const unsigned int modseq_ext_len =
modseq_ext_len) == 0) {
/* modseq tracking started */
*cur_modseq += 1;
}
return;
} else {
/* not tracking modseqs */
return;
}
/* ignore expunge requests */
break;
}
case MAIL_TRANSACTION_APPEND:
/* these changes increase modseq */
*cur_modseq += 1;
break;
case MAIL_TRANSACTION_MODSEQ_UPDATE: {
if (*cur_modseq < modseq)
*cur_modseq = modseq;
}
}
}
}
static struct modseq_cache *
{
struct modseq_cache cache;
if (idx > 0) {
/* @UNSAFE: move it to top */
}
return &file->modseq_cache[0];
}
static struct modseq_cache *
{
continue;
return NULL;
/* exact cache hit */
return modseq_cache_hit(file, i);
}
best = i;
}
return NULL;
}
static struct modseq_cache *
{
continue;
return NULL;
/* exact cache hit */
return modseq_cache_hit(file, i);
}
best = i;
}
return NULL;
}
static int
const struct mail_transaction_header **hdr_r)
{
const struct mail_transaction_header *hdr;
/* we've already synced this record at some point. it should
be valid. */
if (trans_size < sizeof(*hdr) ||
"Transaction log corrupted unexpectedly at "
return -1;
}
*offset += trans_size;
return 0;
}
struct mail_transaction_log_file *file,
{
const struct mail_transaction_header *hdr;
struct modseq_cache *cache;
int ret;
return 0;
}
/* nothing usable in cache - scan from beginning */
/* exact cache hit */
return 0;
} else {
/* use cache to skip over some records */
}
if (ret <= 0) {
if (ret < 0)
return -1;
"%s: Transaction log corrupted, can't get modseq",
return -1;
}
while (cur_offset < offset) {
return- 1;
}
/* @UNSAFE: cache the value */
sizeof(*file->modseq_cache) *
return 0;
}
struct mail_transaction_log_file *file,
{
const struct mail_transaction_header *hdr;
struct modseq_cache *cache;
int ret;
return 0;
}
return 0;
}
/* nothing usable in cache - scan from beginning */
/* exact cache hit */
return 0;
} else {
/* use cache to skip over some records */
}
file->sync_offset);
if (ret <= 0) {
if (ret < 0)
return -1;
"%s: Transaction log corrupted, can't get modseq",
return -1;
}
return -1;
if (cur_modseq >= modseq)
break;
}
/* if we got to sync_offset, cur_modseq should be
sync_highest_modseq */
"%s: Transaction log changed unexpectedly, "
return -1;
}
/* @UNSAFE: cache the value */
sizeof(*file->modseq_cache) *
return 0;
}
static int
const struct mail_transaction_header *hdr,
unsigned int trans_size)
{
int ret;
return 1;
/* external transactions: */
/* see if this updates mailbox_sync_offset */
sizeof(*hdr));
if (ret != 0)
break;
break;
break;
break;
break;
case MAIL_TRANSACTION_BOUNDARY: {
const struct mail_transaction_boundary *boundary =
(const void *)(hdr + 1);
/* the full transaction hasn't been written yet */
return 0;
}
break;
}
}
/* external transactions aren't synced to mailbox. we can
update mailbox sync offset to skip this transaction to
avoid re-reading it at the next sync. */
}
return 1;
}
static int
{
const struct mail_transaction_header *hdr;
const void *data;
uint32_t trans_size = 0;
int ret;
return -1;
}
if (trans_size == 0) {
/* unfinished */
return 1;
}
if (trans_size < sizeof(*hdr)) {
"hdr.size too small (%u)", trans_size);
return -1;
}
break;
/* transaction has been fully written */
if (ret < 0)
return -1;
break;
}
}
/* Now that all the mmaped pages have page faulted, check if
the file had changed while doing that. Only after the last
page has faulted can the size returned by fstat() be
trusted. Otherwise it might point to a page boundary while
the next page is still being written.
Without this check we might see partial transactions,
sometimes causing "Extension record updated without intro
prefix" errors. */
return -1;
}
return 0;
}
}
/* There's more data than we could sync at the moment. If the
last record's size wasn't valid, we can't know if it will
be updated unless we've locked the log. */
"Unexpected garbage at EOF");
return -1;
}
/* The size field will be updated soon */
}
"Invalid transaction log size "
return -1;
}
return 1;
}
static int
{
void *data;
if (ret > 0) {
/* success */
return 1;
}
/* failure. don't leave ourselves in an inconsistent state */
if (ret == 0) {
return 0;
/* log file was deleted in NFS server, fail silently */
return 0;
} else {
return -1;
}
}
static int
{
void *data;
do {
if (ret > 0)
read_offset += ret;
if (ret < 0) {
/* log file was deleted in NFS server, fail silently */
return 0;
}
return -1;
}
return 1;
}
static bool
{
/* we already have a newer log file which says that we haven't
synced the entire file. */
return TRUE;
}
return TRUE;
return FALSE;
}
static int
{
int ret;
/* NFS: if file isn't locked, we're optimistic that we can read enough
data without flushing attribute cache. if after reading we notice
that we really should have read more, flush the cache and try again.
if file is locked, the attribute cache was already flushed when
refreshing the log. */
else
}
/* we have to insert missing data to beginning of buffer */
if (ret <= 0)
return ret;
}
}
;
/* we didn't read enough data. flush and try again. */
} else {
}
return ret;
}
static int
{
/* broken start offset */
return 0;
}
return 0;
}
return 1;
}
static int
{
/* in case we just switched to mmaping */
}
return -1;
}
MADV_SEQUENTIAL) < 0)
}
file->buffer_offset = 0;
return 0;
}
static void
{
return;
}
static int
{
int ret;
/* we are going to mmap() this file, but it's not necessarily
mmaped currently. */
return -1;
}
return 0;
}
/* we already have the whole file mapped */
return 0;
if (ret > 0)
return 1;
/* size changed, re-mmap */
}
do {
/* just reading the file is probably faster */
return mail_transaction_log_file_read(file,
FALSE);
}
if (mail_transaction_log_file_mmap(file) < 0)
return -1;
return 0;
} while (ret == 0);
return 1;
}
{
int ret;
/* corrupted */
return 0;
}
/* we're not interested in going further than sync_offset */
end_offset) == 0)
return 0;
}
/* see if we already have it */
return 1;
}
/* set this only when we've synced to end of file while locked
(either end_offset=(uoff_t)-1 or we had to read anyway) */
}
/* we had moved the log to memory but failed to read
the beginning of the log file */
"%s: Beginning of the log isn't available",
return 0;
}
}
/* although we could just skip over the unwanted data, we have
to sync everything so that modseqs are calculated
correctly */
}
else {
}
if (ret <= 0)
return ret;
}
*file)
{
return;
/* just copy to memory */
/* and lose the mmap */
} else if (file->buffer_offset != 0) {
/* we don't have the full log in memory. read it. */
}
}