/* Copyright (c) 2003-2018 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "array.h"
#include "ioloop.h"
#include "file-dotlock.h"
#include "nfs-workarounds.h"
#include "read-full.h"
#include "write-full.h"
#include "mmap-util.h"
#include "mail-index-private.h"
#include "mail-index-modseq.h"
#include "mail-transaction-log-private.h"
static int
static void
const char *function)
{
}
static void
{
unsigned int offset =
int flags;
return;
/* indexid=0 marks the log file as corrupted. we opened the file with
O_APPEND, and now we need to drop it for pwrite() to work (at least
in Linux) */
if (flags < 0) {
return;
}
return;
}
}
}
void
const char *fmt, ...)
{
T_BEGIN {
"Corrupted transaction log file %s seq %u: %s "
} T_END;
}
struct mail_transaction_log_file *
const char *path)
{
return file;
}
{
struct mail_transaction_log_file **p;
if (*p == file) {
break;
}
}
}
}
}
static void
{
return;
/* we can get a valid log offset from index file. initialize
sync_offset from it so we don't have to read the whole log
file from beginning. */
"%s: log_file_head_offset too small",
/* modseqs not used yet */
file->sync_highest_modseq = 0;
} else if (modseq_hdr == NULL ||
/* highest_modseq not synced, start from beginning */
"%s: modseq_hdr.log_offset too large",
} else {
/* start from where we last stopped tracking modseqs */
}
}
}
static void
{
struct mail_transaction_log_file **p;
const char *reason;
bool retry;
/* insert it to correct position */
break;
}
*p = file;
/* if we read any unfinished data, make sure the buffer gets
truncated. */
}
}
static int
struct mail_transaction_log_header *hdr)
{
#ifndef WORDS_BIGENDIAN
#endif
/* not creating index - make sure we have latest header */
if (mail_index_map(index,
MAIL_INDEX_SYNC_HANDLER_HEAD) <= 0)
return -1;
} else {
/* if we got here from mapping, the .log file is
corrupted. use whatever values we got from index
file */
}
}
} else {
}
if (hdr->initial_modseq == 0) {
/* modseq tracking in log files is required for many reasons
nowadays, even if per-message modseqs aren't enabled in
dovecot.index. */
}
/* make sure the sequence always increases to avoid crashes
later. this catches the buggy case where two processes
happen to replace the same log file. */
}
/* make sure the sequence grows */
}
/* this should be always up-to-date */
}
}
return 0;
}
struct mail_transaction_log_file *
{
return NULL;
}
return file;
}
static int
{
int ret;
ret = 1;
else {
}
if (ret > 0) {
return 0;
}
if (ret < 0) {
return -1;
}
"Timeout (%us) while waiting for "
"dotlock for transaction log file %s",
return -1;
}
static int
{
int ret;
return 0;
if (ret < 0) {
return -1;
}
if (ret == 0) {
"Dotlock was lost for transaction log file %s",
return -1;
}
return 0;
}
{
unsigned int lock_timeout_secs;
int ret;
return 0;
return 0;
}
return mail_transaction_log_file_dotlock(file);
"Index is read-only, can't write-lock %s",
return -1;
}
if (ret > 0) {
return 0;
}
if (ret < 0) {
return -1;
}
"Timeout (%us) while waiting for lock for "
"transaction log file %s%s",
return -1;
}
const char *lock_reason)
{
unsigned int lock_time;
return;
return;
i_warning("Transaction log file %s was locked for %u seconds (%s)",
}
return;
}
}
static ssize_t
{
void *dest;
/* just read the entire transaction log to memory.
note that if some of the data hasn't been fully committed
yet (hdr.size=0), the buffer must be truncated later */
file->buffer_offset = 0;
} else {
/* read only the header */
}
/* it's not necessarily an error to read less than wanted header size,
since older versions of the log format used smaller headers. */
pos = 0;
do {
if (ret > 0)
}
}
static int
{
int ret;
/* mark the old file corrupted. we can't safely remove
it from the list however, so return failure. */
/* only mark .2 corrupted, just to make sure we don't lose any
changes from .log in case we're somehow wrong */
ret = 0;
} else {
ret = -1;
}
"Transaction log %s: "
"duplicate transaction log sequence (%u)",
}
return ret;
}
static int
bool ignore_estale)
{
struct mail_transaction_log_file *f;
int ret;
return 0;
if (ret < 0) {
return -1;
}
/* incompatible version - fix silently */
return 0;
}
if (ret < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) {
"unexpected end of file while reading header");
return 0;
}
const unsigned int hdr_version =
/* we have compatibility flags */
#ifndef WORDS_BIGENDIAN
#endif
/* architecture change */
"Rebuilding index file %s: "
"CPU architecture changed",
return 0;
}
}
"Header size too small");
return 0;
}
/* @UNSAFE: smaller than we expected - zero out the fields we
shouldn't have filled */
}
/* corrupted */
"Transaction log file %s: marked corrupted",
return 0;
}
/* index file was probably just rebuilt and we don't
know about it yet */
"indexid changed: %u -> %u",
return 0;
}
/* creating index file. since transaction log is created
first, use the indexid in it to create the main index
to avoid races. */
}
/* make sure we don't already have a file with the same sequence
opened. it shouldn't happen unless the old log file was
corrupted. */
/* old "f" is the .log.2 */
return mail_transaction_log_file_fail_dupe(f);
} else {
/* new "file" is probably the .log.2 */
return mail_transaction_log_file_fail_dupe(file);
}
}
}
return 1;
}
static int
bool ignore_estale)
{
return -1;
}
return 0;
}
static bool
{
return TRUE;
}
return FALSE;
}
{
unsigned int hdr_offset;
/* introduce the extension */
/* add the extension header data */
}
static int
{
const char *path2;
return -1;
}
/* although we check also mtime and file size below, it's done
only to fix broken log files. we don't bother flushing
attribute cache just for that. */
}
/* log creation is locked now - see if someone already created it.
note that if we're rotating, we need to keep the log locked until
the file has been rewritten. and because fcntl() locks are stupid,
if we go and open()+close() the file and we had it already opened,
its locks are lost. so we use stat() to check if the file has been
recreated, although it almost never is. */
if (reset)
return -1;
}
but not when we're replacing a broken log file */
/* no-one else recreated the file */
} else {
/* recreated. use the file if its header is ok */
if (fd == -1) {
return -1;
}
} else {
FALSE) > 0 &&
/* yes, it was ok */
return 0;
}
}
}
/* creating the initial index */
}
return -1;
if (reset) {
/* don't reset modseqs. if we're resetting due to rebuilding
indexes we'll probably want to keep uidvalidity and in such
cases we really don't want to shrink modseqs. */
}
return -1;
}
/* the header isn't important, so don't bother calling
fdatasync() unless it's required */
return -1;
}
}
/* we'll need to preserve the lock */
if (mail_transaction_log_file_lock(file) < 0)
ret = -1;
}
/* if we return -1 the dotlock deletion code closes the fd */
if (ret < 0)
return -1;
/* keep two log files */
if (rename_existing) {
/* rename() would be a nice and easy way to do this, except then
there's a race condition between the rename and
file_dotlock_replace(). during that time the log file
doesn't exist, which could cause problems. */
if (i_unlink_if_exists(path2) < 0) {
/* try to link() anyway */
}
/* ignore the error. we don't care that much about the
second log file and we're going to overwrite this
first one. */
}
/* NOTE: here's a race condition where both .log and .log.2
point to the same file. our reading code should ignore that
though by comparing the inodes. */
}
DOTLOCK_REPLACE_FLAG_DONT_CLOSE_FD) <= 0) {
/* need to unlock to avoid assert-crash in
mail_transaction_log_file_free() */
return -1;
}
/* success */
return 1;
}
bool reset)
{
"Can't create log file %s: Index is read-only",
return -1;
}
"Can't create log file %s: Index is marked corrupted",
return -1;
}
/* With dotlocking we might already have path.lock created, so this
filename has to be different. */
if (fd == -1) {
return -1;
}
/* either fd gets used or the dotlock gets deleted and returned fd
is for the existing file */
if (ret < 0) {
return -1;
}
return ret;
}
const char **reason_r)
{
unsigned int i;
bool ignore_estale;
int ret;
for (i = 0;; i++) {
} else {
}
}
*reason_r = "File doesn't exist";
return 0;
}
return -1;
}
ret = -1;
else if (mail_transaction_log_file_is_dupe(file)) {
/* probably our already opened .log file has been
renamed to .log.2 and we're trying to reopen it.
also possible that we hit a race condition where .log
and .log.2 are linked. */
*reason_r = "File is already open";
return 0;
} else {
}
if (ret > 0) {
/* success */
break;
}
if (ret == 0) {
/* corrupted */
/* don't delete */
} else {
}
*reason_r = "File is corrupted";
return 0;
}
i == MAIL_INDEX_ESTALE_RETRY_COUNT) {
/* syscall error */
return -1;
}
/* ESTALE - try again */
}
return 1;
}
static int
const void *data, unsigned int trans_size,
const char **error_r)
{
const struct mail_transaction_header_update *u = data;
const unsigned int offset_pos =
*error_r = "header update extends beyond record size";
return -1;
}
if (u->offset <= offset_pos &&
sizeof(tail_offset));
/* ignore shrinking tail offsets */
return 1;
"log_file_tail_offset %u goes past sync offset %"PRIuUOFF_T,
} else {
return 1;
}
}
return 0;
}
static bool
{
/* Hide internal flags from modseqs if the log file's version
is new enough. This allows upgrading without the modseqs suddenly
shrinking. */
return TRUE;
for (unsigned int i = 0; i < count; i++) {
if (!MAIL_TRANSACTION_FLAG_UPDATE_IS_INTERNAL(&u[i]))
return TRUE;
}
return FALSE;
}
unsigned int version)
{
i_assert(trans_size != 0);
if (*cur_modseq != 0) {
/* tracking modseqs */
/* modseqs not tracked yet. see if this is a modseq
extension introduction. */
const unsigned int modseq_ext_len =
modseq_ext_len) == 0) {
/* modseq tracking started */
*cur_modseq += 1;
}
return;
} else {
/* not tracking modseqs */
return;
}
/* NOTE: keep in sync with mail_index_transaction_get_highest_modseq() */
/* ignore expunge requests */
break;
}
/* fall through */
case MAIL_TRANSACTION_APPEND:
/* these changes increase modseq */
*cur_modseq += 1;
break;
case MAIL_TRANSACTION_FLAG_UPDATE: {
unsigned int count;
*cur_modseq += 1;
break;
}
case MAIL_TRANSACTION_MODSEQ_UPDATE: {
if (*cur_modseq < modseq)
*cur_modseq = modseq;
}
}
}
}
static struct modseq_cache *
{
if (idx > 0) {
/* @UNSAFE: move it to top */
}
return &file->modseq_cache[0];
}
static struct modseq_cache *
{
continue;
return NULL;
/* exact cache hit */
return modseq_cache_hit(file, i);
}
best = i;
}
return NULL;
}
static struct modseq_cache *
{
continue;
return NULL;
/* exact cache hit */
return modseq_cache_hit(file, i);
}
best = i;
}
return NULL;
}
static int
const struct mail_transaction_header **hdr_r,
const char **error_r)
{
/* we've already synced this record at some point. it should
be valid. */
if (trans_size < sizeof(*hdr) ||
"Transaction log corrupted unexpectedly at "
return -1;
}
*offset += trans_size;
return 0;
}
struct mail_transaction_log_file *file,
const char **error_r)
{
const char *reason;
int ret;
return 0;
}
/* nothing usable in cache - scan from beginning */
/* exact cache hit */
return 0;
} else {
/* use cache to skip over some records */
}
if (ret <= 0) {
"Failed to map transaction log %s for getting modseq "
return -1;
}
while (cur_offset < offset) {
return- 1;
}
/* @UNSAFE: cache the value */
sizeof(*file->modseq_cache) *
return 0;
}
static int
{
const char *reason;
int ret;
/* make sure we've read until end of file. this is especially important
with non-head logs which might only have been opened without being
synced. */
if (ret <= 0) {
"Failed to map transaction log %s for getting offset "
return -1;
}
/* check sync_highest_modseq again in case sync_offset was updated */
return 0;
}
return -1;
}
if (*cur_modseq >= modseq)
break;
}
return 1;
}
struct mail_transaction_log_file *file,
{
int ret;
return 0;
}
return 0;
}
/* nothing usable in cache - scan from beginning */
/* exact cache hit */
return 0;
} else {
/* use cache to skip over some records */
}
&cur_modseq, next_offset_r)) <= 0)
return ret;
/* if we got to sync_offset, cur_modseq should be
sync_highest_modseq */
"%s: Transaction log modseq tracking is corrupted - fixing",
/* retry getting the offset by reading from the beginning
of the file */
&cur_offset, &cur_modseq,
if (ret < 0)
return -1;
/* get it fixed on the next sync */
/* clear cache, since it's unreliable */
}
/* @UNSAFE: cache the value */
sizeof(*file->modseq_cache) *
return 0;
}
static int
const struct mail_transaction_header *hdr,
unsigned int trans_size, const char **error_r)
{
int ret;
return 1;
/* external transactions: */
/* see if this updates mailbox_sync_offset */
if (ret != 0)
break;
break;
break;
break;
break;
case MAIL_TRANSACTION_BOUNDARY: {
(const void *)(hdr + 1);
/* the full transaction hasn't been written yet */
return 0;
}
break;
}
}
/* external transactions aren't synced to mailbox. we can
update mailbox sync offset to skip this transaction to
avoid re-reading it at the next sync. */
}
return 1;
}
static int
{
const void *data;
int ret;
/* fix the sync_offset to avoid crashes later on */
return 0;
}
if (trans_size == 0) {
/* unfinished */
return 1;
}
if (trans_size < sizeof(*hdr)) {
"hdr.size too small (%u)", trans_size);
return 0;
}
break;
/* transaction has been fully written */
if (ret < 0)
return 0;
break;
}
}
/* Now that all the mmaped pages have page faulted, check if
the file had changed while doing that. Only after the last
page has faulted, the size returned by fstat() can be
trusted. Otherwise it might point to a page boundary while
the next page is still being written.
Without this check we might see partial transactions,
sometimes causing "Extension record updated without intro
prefix" errors. */
return -1;
}
*reason_r = "File size changed - retrying";
return 0;
}
}
/* There's more data than we could sync at the moment. If the
last record's size wasn't valid, we can't know if it will
be updated unless we've locked the log. */
*reason_r = "Unexpected garbage at EOF";
return 0;
}
/* The size field will be updated soon */
}
"Invalid transaction log size "
return 0;
}
return 1;
}
static int
{
void *data;
if (ret > 0) {
/* success */
return 1;
}
/* failure. don't leave ourselves in an inconsistent state */
if (ret == 0) {
*reason_r = "file shrank unexpectedly";
return 0;
/* log file was deleted in NFS server, fail silently */
return 0;
} else {
return -1;
}
}
static int
const char **reason_r)
{
void *data;
do {
if (ret > 0)
read_offset += ret;
if (ret < 0) {
/* log file was deleted in NFS server, fail silently */
return 0;
}
return -1;
}
return 1;
}
static bool
{
/* we already have a newer log file which says that we haven't
synced the entire file. */
return TRUE;
}
return TRUE;
return FALSE;
}
static int
const char **reason_r)
{
bool retry;
int ret;
/* NFS: if file isn't locked, we're optimistic that we can read enough
data without flushing attribute cache. if after reading we notice
that we really should have read more, flush the cache and try again.
if file is locked, the attribute cache was already flushed when
refreshing the log. */
else
}
/* we have to insert missing data to beginning of buffer */
if (ret <= 0)
return ret;
}
}
;
/* we didn't read enough data. flush and try again. */
}
return ret;
}
static bool
const char **reason_r)
{
/* broken start offset */
return FALSE;
}
}
"%s, file unexpectedly replaced", *reason_r);
}
"%s, file unexpectedly deleted", *reason_r);
} else {
}
return FALSE;
}
return FALSE;
}
return TRUE;
}
static int
const char **reason_r)
{
/* we may have switched to mmaping */
}
return -1;
}
MADV_SEQUENTIAL) < 0)
}
file->buffer_offset = 0;
return 0;
}
static void
{
return;
}
static int
{
bool retry;
int ret;
/* we are going to mmap() this file, but it's not necessarily
mmaped currently. */
return -1;
}
return 0;
}
/* we already have the whole file mapped */
!retry)
return ret;
/* size changed, re-mmap */
}
do {
/* just reading the file is probably faster */
return mail_transaction_log_file_read(file,
}
return -1;
} while (retry);
return ret;
}
const char **reason_r)
{
int ret;
/* corrupted */
*reason_r = "corrupted, indexid=0";
return 0;
}
/* we're not interested in going further than sync_offset */
return 0;
}
/* see if we already have it */
return 1;
}
/* set this only when we've synced to end of file while locked
(either end_offset=(uoff_t)-1 or we had to read anyway) */
}
/* we had moved the log to memory but failed to read
the beginning of the log file */
*reason_r = "Beginning of the log isn't available";
return 0;
}
}
/* although we could just skip over the unwanted data, we have
to sync everything so that modseqs are calculated
correctly */
}
else {
}
if (ret <= 0)
return ret;
reason_r) ? 1 : 0;
}
{
const char *error;
int ret = 0;
return 0;
/* just copy to memory */
/* and lose the mmap */
} else if (file->buffer_offset != 0) {
/* we don't have the full log in the memory. read it. */
if (ret <= 0) {
}
}
return ret < 0 ? -1 : 0;
}