journal-file.c revision fc68c92973e5437ee0489c1bc80d80f0a7b6ca0b
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer This file is part of systemd.
fff87a35d9e26c0d4ea41273a963c0eb20e18da4Zbigniew Jędrzejewski-Szmek Copyright 2011 Lennart Poettering
889a90422dd47284dffa32b9234a6e58991b000cRonny Chevalier systemd is free software; you can redistribute it and/or modify it
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer under the terms of the GNU Lesser General Public License as published by
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer the Free Software Foundation; either version 2.1 of the License, or
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer (at your option) any later version.
889a90422dd47284dffa32b9234a6e58991b000cRonny Chevalier systemd is distributed in the hope that it will be useful, but
889a90422dd47284dffa32b9234a6e58991b000cRonny Chevalier WITHOUT ANY WARRANTY; without even the implied warranty of
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
889a90422dd47284dffa32b9234a6e58991b000cRonny Chevalier Lesser General Public License for more details.
32d965851d8cbb39f8ee0eeaf76a89e8f5fc174fLennart Poettering You should have received a copy of the GNU Lesser General Public License
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer along with systemd; If not, see <http://www.gnu.org/licenses/>.
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer/* This is the minimum journal file size */
0d6e798a784ef0ba6b95512e4453067b2f84a91aHarald Hoyer#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
889a90422dd47284dffa32b9234a6e58991b000cRonny Chevalier/* These are the lower and upper bounds if we deduce the max_use value
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer * from the file system size */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
6b197f2a03fa03a2a853cf726d47be2ea4c623b6Harald Hoyer#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer/* This is the upper bound if we deduce max_size from max_use */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer/* This is the upper bound if we deduce the keep_free value from the
5c404f1ab8e96efedb983806443ca982a1b2a372Evgeny Vereshchagin * file system size */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
c90feab4ff8bc23d88d4f9c67d9652ba189cb51bMichal Schmidt/* This is the keep_free value when we can't determine the system
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
0d6e798a784ef0ba6b95512e4453067b2f84a91aHarald Hoyer/* n_data was the first entry we added after the initial file format design */
6f9d3b08cf3e50d3903282d2ce36244bb86c7b7cDaniel Mack#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
6f9d3b08cf3e50d3903282d2ce36244bb86c7b7cDaniel Mack/* How many entries to keep in the entry array chain cache at max */
6f9d3b08cf3e50d3903282d2ce36244bb86c7b7cDaniel Mack/* How much to increase the journal file size at once each time we allocate something new. */
6f9d3b08cf3e50d3903282d2ce36244bb86c7b7cDaniel Mack#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
d9890f4ed47c0d565915360d8bae3b7a1428f285Harald Hoyer/* Reread fstat() of the file for detecting deletions at least this often */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyer/* The mmap context to use for the header we pick as one above the last defined typed */
898720b7e9cf3bdf7a93e435cbed5dd6942ecf9bHarald Hoyerstatic int journal_file_set_online(JournalFile *f) {
return -EPERM;
return -EINVAL;
return -EIO;
case STATE_ONLINE:
case STATE_OFFLINE:
return -EINVAL;
assert(f);
if (!f->writable)
return -EPERM;
return -EINVAL;
return -EIO;
return -EIO;
assert(f);
#ifdef HAVE_GCRYPT
if (f->mmap)
#ifdef HAVE_GCRYPT
if (f->fss_file)
else if (f->fsprg_state)
if (f->hmac)
free(f);
Header h = {};
ssize_t k;
assert(f);
if (template) {
return -errno;
return -EIO;
assert(f);
f->tail_entry_monotonic_valid = true;
r = journal_file_set_online(f);
assert(f);
return -EBADMSG;
if (flags)
return -EPROTONOSUPPORT;
if (flags)
return -EPROTONOSUPPORT;
return -EBADMSG;
return -EBADMSG;
return -EBADMSG;
if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
return -ENODATA;
if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
return -ENODATA;
return -ENODATA;
if (f->writable) {
return -EHOSTDOWN;
return -EBUSY;
return -ESHUTDOWN;
return -EBUSY;
assert(f);
return -errno;
return -EIDRM;
assert(f);
return -EIO;
old_size =
return journal_file_fstat(f);
return -E2BIG;
available = 0;
return -E2BIG;
return journal_file_fstat(f);
static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
assert(f);
if (size <= 0)
return -EINVAL;
r = journal_file_fstat(f);
return -EADDRNOTAVAIL;
return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
return sizeof(ObjectHeader);
Object *o;
uint64_t s;
assert(f);
return -EFAULT;
o = (Object*) t;
if (s < sizeof(ObjectHeader))
return -EBADMSG;
return -EBADMSG;
if (s < minimum_header_size(o))
return -EBADMSG;
return -EBADMSG;
if (s > sizeof(ObjectHeader)) {
o = (Object*) t;
*ret = o;
uint64_t r;
assert(f);
if (seqnum) {
*seqnum = r;
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
uint64_t p;
assert(f);
r = journal_file_set_online(f);
o = (Object*) t;
*ret = o;
*offset = p;
uint64_t s, p;
Object *o;
assert(f);
if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
r = journal_file_append_object(f,
uint64_t s, p;
Object *o;
assert(f);
r = journal_file_append_object(f,
uint64_t s, p;
assert(f);
r = journal_file_move_to(f,
f->data_hash_table = t;
uint64_t s, p;
assert(f);
r = journal_file_move_to(f,
f->field_hash_table = t;
static int journal_file_link_field(
JournalFile *f,
Object *o,
uint64_t p, h, m;
assert(f);
assert(o);
return -EINVAL;
return -EBADMSG;
h = hash % m;
static int journal_file_link_data(
JournalFile *f,
Object *o,
uint64_t p, h, m;
assert(f);
assert(o);
return -EINVAL;
return -EBADMSG;
h = hash % m;
JournalFile *f,
assert(f);
return -EBADMSG;
h = hash % m;
Object *o;
if (ret)
*ret = o;
if (offset)
*offset = p;
JournalFile *f,
assert(f);
return journal_file_find_field_object_with_hash(f,
JournalFile *f,
assert(f);
return -EBADMSG;
h = hash % m;
Object *o;
goto next;
uint64_t l;
return -EBADMSG;
if (ret)
*ret = o;
if (offset)
*offset = p;
return -EPROTONOSUPPORT;
if (ret)
*ret = o;
if (offset)
*offset = p;
next:
JournalFile *f,
assert(f);
return journal_file_find_data_object_with_hash(f,
static int journal_file_append_field(
JournalFile *f,
Object *o;
assert(f);
if (ret)
*ret = o;
if (offset)
*offset = p;
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
*offset = p;
static int journal_file_append_data(
JournalFile *f,
Object *o;
int r, compression = 0;
const void *eq;
assert(f);
if (ret)
*ret = o;
if (offset)
*offset = p;
if (f->compress_xz &&
if (compression) {
if (!data)
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
*offset = p;
assert(o);
assert(o);
assert(o);
uint64_t p) {
Object *o;
assert(f);
assert(p > 0);
n = journal_file_entry_array_n_items(o);
ap = a;
if (hidx > n)
#ifdef HAVE_GCRYPT
if (ap == 0)
uint64_t p) {
assert(f);
assert(p > 0);
if (*idx == 0)
le64_t i;
uint64_t p;
assert(f);
assert(o);
return -EINVAL;
return link_entry_into_array_plus_one(f,
offset);
uint64_t n, i;
assert(f);
assert(o);
return -EINVAL;
r = link_entry_into_array(f,
offset);
/* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
f->tail_entry_monotonic_valid = true;
n = journal_file_entry_n_items(o);
static int journal_file_append_entry_internal(
JournalFile *f,
Object *o;
assert(f);
#ifdef HAVE_GCRYPT
if (ret)
*ret = o;
if (offset)
assert(f);
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
assert(f);
if (!ts) {
if (f->tail_entry_monotonic_valid &&
return -EINVAL;
#ifdef HAVE_GCRYPT
for (i = 0; i < n_iovec; i++) {
uint64_t p;
Object *o;
r = -EIO;
typedef struct ChainCacheItem {
static void chain_cache_put(
OrderedHashmap *h,
if (!ci) {
if (!ci)
static int generic_array_get(
JournalFile *f,
uint64_t i,
Object *o;
uint64_t p = 0, a, t = 0;
assert(f);
a = first;
uint64_t k;
k = journal_file_entry_array_n_items(o);
goto found;
if (ret)
*ret = o;
if (offset)
*offset = p;
static int generic_array_get_plus_one(
JournalFile *f,
uint64_t i,
Object *o;
assert(f);
if (ret)
*ret = o;
if (offset)
static int generic_array_bisect(
JournalFile *f,
uint64_t n,
bool subtract_one = false;
assert(f);
a = first;
if (r == TEST_LEFT) {
if (right <= 0)
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT) {
left = 0;
if (last_index > 0) {
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = x;
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = y;
subtract_one = true;
i = left;
goto found;
return -EBADMSG;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = i;
subtract_one = true;
goto found;
if (subtract_one && t == 0 && i == 0)
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
if (ret)
*ret = o;
if (offset)
*offset = p;
if (idx)
static int generic_array_bisect_plus_one(
JournalFile *f,
uint64_t n,
bool step_back = false;
Object *o;
assert(f);
if (r == TEST_FOUND)
if (r == TEST_LEFT)
if (r == TEST_RIGHT) {
goto found;
if (r == 0 && step_back)
goto found;
if (r > 0 && idx)
(*idx) ++;
if (ret)
*ret = o;
if (offset)
if (idx)
*idx = 0;
assert(f);
assert(p > 0);
if (p == needle)
return TEST_FOUND;
else if (p < needle)
return TEST_LEFT;
return TEST_RIGHT;
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
JournalFile *f,
return generic_array_bisect(f,
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
JournalFile *f,
return generic_array_bisect(f,
Object *o;
assert(f);
assert(p > 0);
return TEST_FOUND;
return TEST_LEFT;
return TEST_RIGHT;
static int find_data_object_by_boot_id(
JournalFile *f,
Object **o,
uint64_t *b) {
JournalFile *f,
Object *o;
assert(f);
return -ENOENT;
return generic_array_bisect_plus_one(f,
f->current_offset = 0;
f->current_seqnum = 0;
f->current_realtime = 0;
f->current_monotonic = 0;
f->current_xor_hash = 0;
JournalFile *f,
uint64_t p,
assert(f);
r = generic_array_bisect(f,
r = generic_array_get(f,
f->path, i);
return -EBADMSG;
if (offset)
JournalFile *f,
uint64_t n, i;
Object *d;
assert(f);
assert(p > 0 || !o);
return -EINVAL;
return generic_array_get_plus_one(f,
JournalFile *f,
uint64_t p,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
JournalFile *f,
Object *o, *d;
uint64_t b, z;
assert(f);
return -ENOENT;
uint64_t p, q;
if (ret)
if (offset)
*offset = q;
JournalFile *f,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
JournalFile *f,
Object *d;
assert(f);
return generic_array_bisect_plus_one(f,
Object *o;
uint64_t p;
assert(f);
goto fail;
case OBJECT_UNUSED:
case OBJECT_DATA:
case OBJECT_FIELD:
case OBJECT_ENTRY:
case OBJECT_FIELD_HASH_TABLE:
case OBJECT_DATA_HASH_TABLE:
case OBJECT_ENTRY_ARRAY:
case OBJECT_TAG:
fail:
assert(f);
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
unsigned attrs;
assert(f);
int journal_file_open(
const char *fname,
int flags,
bool compress,
bool seal,
bool newly_created = false;
JournalFile *f;
return -EINVAL;
return -EINVAL;
return -ENOMEM;
#if defined(HAVE_LZ4)
#ifdef HAVE_GCRYPT
if (mmap_cache)
if (!f->mmap) {
r = -ENOMEM;
goto fail;
if (!f->path) {
r = -ENOMEM;
goto fail;
if (!f->chain_cache) {
r = -ENOMEM;
goto fail;
if (f->fd < 0) {
r = -errno;
goto fail;
r = journal_file_fstat(f);
goto fail;
(void) journal_file_warn_btrfs(f);
#ifdef HAVE_GCRYPT
if (f->seal) {
r = journal_file_fss_load(f);
f->seal = false;
goto fail;
r = journal_file_fstat(f);
goto fail;
newly_created = true;
r = -EIO;
goto fail;
r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
goto fail;
f->header = h;
if (!newly_created) {
r = journal_file_verify_header(f);
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_fss_load(f);
goto fail;
if (f->writable) {
if (metrics) {
} else if (template)
r = journal_file_refresh_header(f);
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_hmac_setup(f);
goto fail;
if (newly_created) {
goto fail;
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_append_first_tag(f);
goto fail;
goto fail;
r = journal_file_map_data_hash_table(f);
goto fail;
r = -EIO;
goto fail;
*ret = f;
fail:
r = -EIO;
size_t l;
assert(f);
assert(*f);
old_file = *f;
return -EINVAL;
return -EINVAL;
return -ENOMEM;
return -errno;
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
*f = new_file;
const char *fname,
int flags,
bool compress,
bool seal,
size_t l;
if (!IN_SET(r,
random_u64()) < 0)
return -ENOMEM;
return -errno;
(void) btrfs_defrag(p);
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
uint64_t i, n;
assert(o);
assert(p);
return -EPERM;
n = journal_file_entry_n_items(o);
uint64_t l, h;
size_t t;
void *data;
Object *u;
return -EBADMSG;
t = (size_t) l;
if ((uint64_t) t != l)
return -E2BIG;
l = rsize;
return -EPROTONOSUPPORT;
return -EIO;
assert(m);
if (fs_size > 0) {
if (fs_size > 0) {
assert(f);
if (from) {
return -ENOENT;
if (to) {
return -ENOENT;
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
Object *o;
uint64_t p;
assert(f);
if (from) {
if (to) {
r = generic_array_get_plus_one(f,
&o, NULL);
assert(f);
if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
if (max_file_usec > 0) {
usec_t t, h;
if (h > 0 && t > h + max_file_usec)