journal-file.c revision 65eae3b76243d2dfd869f8c43b787575f7b4b994
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier This file is part of systemd.
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier Copyright 2011 Lennart Poettering
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier systemd is free software; you can redistribute it and/or modify it
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier under the terms of the GNU Lesser General Public License as published by
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier the Free Software Foundation; either version 2.1 of the License, or
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier (at your option) any later version.
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier systemd is distributed in the hope that it will be useful, but
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier WITHOUT ANY WARRANTY; without even the implied warranty of
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier Lesser General Public License for more details.
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier You should have received a copy of the GNU Lesser General Public License
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier along with systemd; If not, see <http://www.gnu.org/licenses/>.
/* Default sizes, in bytes, of the data and field hash tables: room for
 * 2047 and 333 HashItem slots respectively. */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* NOTE(review): presumably the smallest payload size, in bytes, for which
 * compression is attempted; the use site is not visible in this excerpt,
 * so confirm against journal_file_append_data(). */
c1d630d5fd3c0b3307811d51f9840652e066a0f2Zbigniew Jędrzejewski-Szmek#define COMPRESSION_SIZE_THRESHOLD (512ULL)
c1d630d5fd3c0b3307811d51f9840652e066a0f2Zbigniew Jędrzejewski-Szmek/* This is the minimum journal file size */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier/* These are the lower and upper bounds if we deduce the max_use value
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier * from the file system size */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
1f532d7ef35210f11e75cfcab0535e65a37901f3Ronny Chevalier/* This is the upper bound if we deduce max_size from max_use */
1f532d7ef35210f11e75cfcab0535e65a37901f3Ronny Chevalier#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
e6b5c5d03cb28d2149dc1c124c2a315911b91f4fRonny Chevalier/* This is the upper bound if we deduce the keep_free value from the
assert(f);
if (!f->writable)
return -EPERM;
return -EINVAL;
return -EIO;
case STATE_ONLINE:
case STATE_OFFLINE:
return -EINVAL;
assert(f);
if (!f->writable)
return -EPERM;
return -EINVAL;
return -EIO;
return -EIO;
assert(f);
#ifdef HAVE_GCRYPT
if (f->mmap)
#ifdef HAVE_GCRYPT
if (f->fss_file)
else if (f->fsprg_state)
if (f->hmac)
free(f);
Header h = {};
ssize_t k;
assert(f);
if (template) {
return -errno;
return -EIO;
assert(f);
f->tail_entry_monotonic_valid = true;
r = journal_file_set_online(f);
assert(f);
return -EBADMSG;
if (flags)
return -EPROTONOSUPPORT;
if (flags)
return -EPROTONOSUPPORT;
return -EBADMSG;
return -EBADMSG;
return -EBADMSG;
if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
return -ENODATA;
if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
return -ENODATA;
return -ENODATA;
if (f->writable) {
return -EHOSTDOWN;
return -EBUSY;
return -ESHUTDOWN;
return -EBUSY;
assert(f);
return -errno;
return -EIDRM;
assert(f);
return -EIO;
old_size =
return journal_file_fstat(f);
return -E2BIG;
available = 0;
return -E2BIG;
return journal_file_fstat(f);
static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
assert(f);
if (size <= 0)
return -EINVAL;
r = journal_file_fstat(f);
return -EADDRNOTAVAIL;
return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
return sizeof(ObjectHeader);
Object *o;
uint64_t s;
assert(f);
return -EFAULT;
o = (Object*) t;
if (s < sizeof(ObjectHeader))
return -EBADMSG;
return -EBADMSG;
if (s < minimum_header_size(o))
return -EBADMSG;
return -EBADMSG;
if (s > sizeof(ObjectHeader)) {
o = (Object*) t;
*ret = o;
uint64_t r;
assert(f);
if (seqnum) {
*seqnum = r;
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
uint64_t p;
assert(f);
r = journal_file_set_online(f);
o = (Object*) t;
*ret = o;
*offset = p;
uint64_t s, p;
Object *o;
assert(f);
if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
r = journal_file_append_object(f,
uint64_t s, p;
Object *o;
assert(f);
r = journal_file_append_object(f,
uint64_t s, p;
assert(f);
r = journal_file_move_to(f,
f->data_hash_table = t;
uint64_t s, p;
assert(f);
r = journal_file_move_to(f,
f->field_hash_table = t;
static int journal_file_link_field(
JournalFile *f,
Object *o,
uint64_t p, h, m;
assert(f);
assert(o);
return -EINVAL;
return -EBADMSG;
h = hash % m;
static int journal_file_link_data(
JournalFile *f,
Object *o,
uint64_t p, h, m;
assert(f);
assert(o);
return -EINVAL;
return -EBADMSG;
h = hash % m;
JournalFile *f,
assert(f);
return -EBADMSG;
h = hash % m;
Object *o;
if (ret)
*ret = o;
if (offset)
*offset = p;
JournalFile *f,
assert(f);
return journal_file_find_field_object_with_hash(f,
JournalFile *f,
assert(f);
return -EBADMSG;
h = hash % m;
Object *o;
goto next;
uint64_t l;
return -EBADMSG;
if (ret)
*ret = o;
if (offset)
*offset = p;
return -EPROTONOSUPPORT;
if (ret)
*ret = o;
if (offset)
*offset = p;
next:
JournalFile *f,
assert(f);
return journal_file_find_data_object_with_hash(f,
static int journal_file_append_field(
JournalFile *f,
Object *o;
assert(f);
if (ret)
*ret = o;
if (offset)
*offset = p;
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
*offset = p;
static int journal_file_append_data(
JournalFile *f,
Object *o;
int r, compression = 0;
const void *eq;
assert(f);
if (ret)
*ret = o;
if (offset)
*offset = p;
if (f->compress_xz &&
if (compression) {
if (!data)
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
*offset = p;
assert(o);
assert(o);
assert(o);
uint64_t p) {
Object *o;
assert(f);
assert(p > 0);
n = journal_file_entry_array_n_items(o);
ap = a;
if (hidx > n)
#ifdef HAVE_GCRYPT
if (ap == 0)
uint64_t p) {
assert(f);
assert(p > 0);
if (*idx == 0)
le64_t i;
uint64_t p;
assert(f);
assert(o);
return -EINVAL;
return link_entry_into_array_plus_one(f,
offset);
uint64_t n, i;
assert(f);
assert(o);
return -EINVAL;
r = link_entry_into_array(f,
offset);
/* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
f->tail_entry_monotonic_valid = true;
n = journal_file_entry_n_items(o);
static int journal_file_append_entry_internal(
JournalFile *f,
Object *o;
assert(f);
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
assert(f);
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
assert(f);
if (!ts) {
if (f->tail_entry_monotonic_valid &&
return -EINVAL;
#ifdef HAVE_GCRYPT
for (i = 0; i < n_iovec; i++) {
uint64_t p;
Object *o;
r = -EIO;
typedef struct ChainCacheItem {
static void chain_cache_put(
OrderedHashmap *h,
if (!ci) {
if (!ci)
static int generic_array_get(
JournalFile *f,
uint64_t i,
Object *o;
uint64_t p = 0, a, t = 0;
assert(f);
a = first;
uint64_t k;
k = journal_file_entry_array_n_items(o);
goto found;
if (ret)
*ret = o;
if (offset)
*offset = p;
static int generic_array_get_plus_one(
JournalFile *f,
uint64_t i,
Object *o;
assert(f);
if (ret)
*ret = o;
if (offset)
static int generic_array_bisect(
JournalFile *f,
uint64_t n,
bool subtract_one = false;
assert(f);
a = first;
if (r == TEST_LEFT) {
if (right <= 0)
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT) {
left = 0;
if (last_index > 0) {
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = x;
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = y;
subtract_one = true;
i = left;
goto found;
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = i;
subtract_one = true;
goto found;
if (subtract_one && t == 0 && i == 0)
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
if (ret)
*ret = o;
if (offset)
*offset = p;
if (idx)
static int generic_array_bisect_plus_one(
JournalFile *f,
uint64_t n,
bool step_back = false;
Object *o;
assert(f);
if (r == TEST_FOUND)
if (r == TEST_LEFT)
if (r == TEST_RIGHT) {
goto found;
if (r == 0 && step_back)
goto found;
if (r > 0 && idx)
(*idx) ++;
if (ret)
*ret = o;
if (offset)
if (idx)
*idx = 0;
assert(f);
assert(p > 0);
if (p == needle)
return TEST_FOUND;
else if (p < needle)
return TEST_LEFT;
return TEST_RIGHT;
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
JournalFile *f,
return generic_array_bisect(f,
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
JournalFile *f,
return generic_array_bisect(f,
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
static int find_data_object_by_boot_id(
JournalFile *f,
Object **o,
uint64_t *b) {
JournalFile *f,
Object *o;
assert(f);
return -ENOENT;
return generic_array_bisect_plus_one(f,
f->current_offset = 0;
f->current_seqnum = 0;
f->current_realtime = 0;
f->current_monotonic = 0;
f->current_xor_hash = 0;
JournalFile *f,
uint64_t p,
assert(f);
r = generic_array_bisect(f,
r = generic_array_get(f,
f->path, i);
return -EBADMSG;
if (offset)
JournalFile *f,
uint64_t n, i;
Object *d;
assert(f);
assert(p > 0 || !o);
return -EINVAL;
return generic_array_get_plus_one(f,
JournalFile *f,
uint64_t p,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
JournalFile *f,
Object *o, *d;
uint64_t b, z;
assert(f);
return -ENOENT;
uint64_t p, q;
if (ret)
if (offset)
*offset = q;
JournalFile *f,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
JournalFile *f,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
Object *o;
uint64_t p;
assert(f);
goto fail;
case OBJECT_UNUSED:
case OBJECT_DATA:
case OBJECT_FIELD:
case OBJECT_ENTRY:
case OBJECT_FIELD_HASH_TABLE:
case OBJECT_DATA_HASH_TABLE:
case OBJECT_ENTRY_ARRAY:
case OBJECT_TAG:
fail:
assert(f);
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
int journal_file_open(
const char *fname,
int flags,
bool compress,
bool seal,
bool newly_created = false;
JournalFile *f;
return -EINVAL;
return -EINVAL;
return -ENOMEM;
#if defined(HAVE_LZ4)
#ifdef HAVE_GCRYPT
if (mmap_cache)
if (!f->mmap) {
r = -ENOMEM;
goto fail;
if (!f->path) {
r = -ENOMEM;
goto fail;
if (!f->chain_cache) {
r = -ENOMEM;
goto fail;
if (f->fd < 0) {
r = -errno;
goto fail;
r = journal_file_fstat(f);
goto fail;
if (r < 0 && r != -ENOTTY)
#ifdef HAVE_GCRYPT
if (f->seal) {
r = journal_file_fss_load(f);
f->seal = false;
goto fail;
r = journal_file_fstat(f);
goto fail;
newly_created = true;
r = -EIO;
goto fail;
r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
r = -errno;
goto fail;
f->header = h;
if (!newly_created) {
r = journal_file_verify_header(f);
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_fss_load(f);
goto fail;
if (f->writable) {
if (metrics) {
} else if (template)
r = journal_file_refresh_header(f);
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_hmac_setup(f);
goto fail;
if (newly_created) {
goto fail;
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_append_first_tag(f);
goto fail;
goto fail;
r = journal_file_map_data_hash_table(f);
goto fail;
r = -EIO;
goto fail;
*ret = f;
fail:
r = -EIO;
size_t l;
assert(f);
assert(*f);
old_file = *f;
return -EINVAL;
return -EINVAL;
return -ENOMEM;
return -errno;
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
*f = new_file;
const char *fname,
int flags,
bool compress,
bool seal,
size_t l;
random_u64()) < 0)
return -ENOMEM;
return -errno;
(void) btrfs_defrag(p);
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
uint64_t i, n;
assert(o);
assert(p);
return -EPERM;
n = journal_file_entry_n_items(o);
uint64_t l, h;
size_t t;
void *data;
Object *u;
return -EBADMSG;
t = (size_t) l;
if ((uint64_t) t != l)
return -E2BIG;
l = rsize;
return -EPROTONOSUPPORT;
return -EIO;
assert(m);
if (fs_size > 0) {
if (fs_size > 0) {
assert(f);
if (from) {
return -ENOENT;
if (to) {
return -ENOENT;
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
Object *o;
uint64_t p;
assert(f);
if (from) {
if (to) {
r = generic_array_get_plus_one(f,
&o, NULL);
assert(f);
if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
if (max_file_usec > 0) {
usec_t t, h;
if (h > 0 && t > h + max_file_usec)