/***
This file is part of systemd.
Copyright 2011 Lennart Poettering

systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <unistd.h>
#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "compress.h"
#include "fd-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "lookup3.h"
#include "parse-util.h"
#include "random-util.h"
#include "sd-event.h"
#include "string-util.h"
#include "xattr-util.h"
/* This is the minimum journal file size */
/* These are the lower and upper bounds if we deduce the max_use value
* from the file system size */
/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
/* This is the upper bound if we deduce max_size from max_use */
/* This is the upper bound if we deduce the keep_free value from the
* file system size */
/* This is the keep_free value when we can't determine the system
* size */
/* This is the default maximum number of journal files to keep around. */
/* n_data was the first entry we added after the initial file format design */
/* How many entries to keep in the entry array chain cache at max */
/* How much to increase the journal file size at once each time we allocate something new. */
/* Reread fstat() of the file for detecting deletions at least this often */
/* The mmap context to use for the header we pick as one above the last defined typed */
assert(f);
if (!f->writable)
return -EPERM;
return -EINVAL;
return -EIO;
case STATE_ONLINE:
return 0;
case STATE_OFFLINE:
return 0;
default:
return -EINVAL;
}
}
assert(f);
if (!f->writable)
return -EPERM;
return -EINVAL;
return 0;
return -EIO;
return -EIO;
return 0;
}
assert(f);
#ifdef HAVE_GCRYPT
/* Write the final tag */
#endif
if (f->post_change_timer) {
int enabled;
if (enabled == SD_EVENT_ONESHOT)
}
if (f->fd >= 0 && f->defrag_on_close) {
/* Be friendly to btrfs: turn COW back on again now,
* and defragment the file. We won't write to the file
* ever again, hence remove all fragmentation, and
* reenable all the good bits COW usually provides
* (such as data checksumming). */
(void) btrfs_defrag_fd(f->fd);
}
safe_close(f->fd);
mmap_cache_unref(f->mmap);
free(f->compress_buffer);
#endif
#ifdef HAVE_GCRYPT
if (f->fss_file)
else
free(f->fsprg_state);
free(f->fsprg_seed);
if (f->hmac)
gcry_md_close(f->hmac);
#endif
free(f);
return NULL;
}
Header h = {};
ssize_t k;
int r;
assert(f);
h.incompatible_flags |= htole32(
h.compatible_flags = htole32(
f->seal * HEADER_COMPATIBLE_SEALED);
r = sd_id128_randomize(&h.file_id);
if (r < 0)
return r;
if (template) {
} else
if (k < 0)
return -errno;
if (k != sizeof(h))
return -EIO;
return 0;
}
int r;
assert(f);
if (r < 0)
return r;
r = sd_id128_get_boot(&boot_id);
if (r < 0)
return r;
f->tail_entry_monotonic_valid = true;
r = journal_file_set_online(f);
/* Sync the online state to disk */
return r;
}
assert(f);
return -EBADMSG;
/* In both read and write mode we refuse to open files with
* incompatible flags we don't know */
if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
if (flags & ~HEADER_INCOMPATIBLE_ANY)
if (flags)
return -EPROTONOSUPPORT;
}
/* When open for writing we refuse to open files with
* compatible flags, too */
if (flags & ~HEADER_COMPATIBLE_ANY)
if (flags)
return -EPROTONOSUPPORT;
}
return -EBADMSG;
/* The first addition was n_data, so check that we are at least this large */
return -EBADMSG;
return -EBADMSG;
if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
return -ENODATA;
if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
return -ENODATA;
return -ENODATA;
if (f->writable) {
int r;
r = sd_id128_get_machine(&machine_id);
if (r < 0)
return r;
return -EHOSTDOWN;
if (state == STATE_ONLINE) {
return -EBUSY;
} else if (state == STATE_ARCHIVED)
return -ESHUTDOWN;
else if (state != STATE_OFFLINE) {
return -EBUSY;
}
}
return 0;
}
assert(f);
return -errno;
/* Refuse appending to files that are already deleted */
return -EIDRM;
return 0;
}
int r;
assert(f);
/* We assume that this file is not sparse, and we know that
* for sure, since we always call posix_fallocate()
* ourselves */
return -EIO;
old_size =
/* We already pre-allocated enough space, but before
* we write to it, let's check with fstat() if the
* file got deleted, in order make sure we don't throw
* away the data immediately. Don't check fstat() for
* all writes though, but only once ever 10s. */
return 0;
return journal_file_fstat(f);
}
/* Allocate more space. */
return -E2BIG;
return -E2BIG;
}
}
/* Increase by larger blocks at once */
/* Note that the glibc fallocate() fallback is very
inefficient, hence we try to minimize the allocation area
as we can. */
if (r != 0)
return -r;
return journal_file_fstat(f);
}
/* One context for each type, plus one catch-all for the rest */
}
static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
int r;
assert(f);
if (size <= 0)
return -EINVAL;
/* Avoid SIGBUS on invalid accesses */
/* Hmm, out of range? Let's refresh the fstat() data
* first, before we trust that check. */
r = journal_file_fstat(f);
if (r < 0)
return r;
return -EADDRNOTAVAIL;
}
return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
}
[OBJECT_DATA] = sizeof(DataObject),
[OBJECT_FIELD] = sizeof(FieldObject),
[OBJECT_ENTRY] = sizeof(EntryObject),
[OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
[OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
[OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
[OBJECT_TAG] = sizeof(TagObject),
};
return sizeof(ObjectHeader);
}
int r;
void *t;
Object *o;
uint64_t s;
assert(f);
/* Objects may only be located at multiple of 64 bit */
return -EFAULT;
if (r < 0)
return r;
o = (Object*) t;
if (s < sizeof(ObjectHeader))
return -EBADMSG;
return -EBADMSG;
if (s < minimum_header_size(o))
return -EBADMSG;
return -EBADMSG;
if (s > sizeof(ObjectHeader)) {
if (r < 0)
return r;
o = (Object*) t;
}
*ret = o;
return 0;
}
uint64_t r;
assert(f);
if (seqnum) {
/* If an external seqnum counter was passed, we update
* both the local and the external one, and set it to
* the maximum of both */
if (*seqnum + 1 > r)
r = *seqnum + 1;
*seqnum = r;
}
if (f->header->head_entry_seqnum == 0)
return r;
}
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
int r;
uint64_t p;
void *t;
assert(f);
r = journal_file_set_online(f);
if (r < 0)
return r;
if (p == 0)
else {
if (r < 0)
return r;
}
r = journal_file_allocate(f, p, size);
if (r < 0)
return r;
if (r < 0)
return r;
o = (Object*) t;
*ret = o;
*offset = p;
return 0;
}
uint64_t s, p;
Object *o;
int r;
assert(f);
/* We estimate that we need 1 hash table entry per 768 bytes
of journal file and we want to make sure we never get
beyond 75% fill level. Calculate the hash table size for
the maximum file size based on these metrics. */
if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
r = journal_file_append_object(f,
&o, &p);
if (r < 0)
return r;
return 0;
}
uint64_t s, p;
Object *o;
int r;
assert(f);
/* We use a fixed size hash table for the fields as this
* number should grow very slowly only */
r = journal_file_append_object(f,
&o, &p);
if (r < 0)
return r;
return 0;
}
uint64_t s, p;
void *t;
int r;
assert(f);
if (f->data_hash_table)
return 0;
r = journal_file_move_to(f,
true,
p, s,
&t);
if (r < 0)
return r;
f->data_hash_table = t;
return 0;
}
uint64_t s, p;
void *t;
int r;
assert(f);
if (f->field_hash_table)
return 0;
r = journal_file_move_to(f,
true,
p, s,
&t);
if (r < 0)
return r;
f->field_hash_table = t;
return 0;
}
static int journal_file_link_field(
JournalFile *f,
Object *o,
uint64_t p, h, m;
int r;
assert(f);
assert(f->field_hash_table);
assert(o);
return -EINVAL;
if (m <= 0)
return -EBADMSG;
/* This might alter the window we are looking at */
h = hash % m;
if (p == 0)
else {
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
}
return 0;
}
static int journal_file_link_data(
JournalFile *f,
Object *o,
uint64_t p, h, m;
int r;
assert(f);
assert(f->data_hash_table);
assert(o);
return -EINVAL;
if (m <= 0)
return -EBADMSG;
/* This might alter the window we are looking at */
h = hash % m;
if (p == 0)
/* Only entry in the hash table is easy */
else {
/* Move back to the previous data object, to patch in
* pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
}
return 0;
}
JournalFile *f,
int r;
assert(f);
/* If the field hash table is empty, we can't find anything */
return 0;
/* Map the field hash table, if it isn't mapped yet. */
if (r < 0)
return r;
if (m <= 0)
return -EBADMSG;
h = hash % m;
while (p > 0) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
}
return 0;
}
JournalFile *f,
assert(f);
return journal_file_find_field_object_with_hash(f,
}
JournalFile *f,
int r;
assert(f);
/* If there's no data hash table, then there's no entry. */
return 0;
/* Map the data hash table, if it isn't mapped yet. */
r = journal_file_map_data_hash_table(f);
if (r < 0)
return r;
if (m <= 0)
return -EBADMSG;
h = hash % m;
while (p > 0) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
goto next;
uint64_t l;
return -EBADMSG;
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
#else
return -EPROTONOSUPPORT;
#endif
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
next:
}
return 0;
}
JournalFile *f,
assert(f);
return journal_file_find_data_object_with_hash(f,
}
static int journal_file_append_field(
JournalFile *f,
Object *o;
int r;
assert(f);
if (r < 0)
return r;
else if (r > 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
if (r < 0)
return r;
r = journal_file_link_field(f, o, p, hash);
if (r < 0)
return r;
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
#ifdef HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
if (r < 0)
return r;
#endif
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
static int journal_file_append_data(
JournalFile *f,
Object *o;
int r, compression = 0;
const void *eq;
assert(f);
if (r < 0)
return r;
if (r > 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
if (r < 0)
return r;
if (compression >= 0) {
} else
/* Compression didn't work, we don't really care why, let's continue without compression */
compression = 0;
}
#endif
if (compression == 0 && size > 0)
r = journal_file_link_data(f, o, p, hash);
if (r < 0)
return r;
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
if (!data)
else
/* Create field object ... */
if (r < 0)
return r;
/* ... and link it in. */
}
#ifdef HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
if (r < 0)
return r;
#endif
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
assert(o);
return 0;
}
assert(o);
return 0;
}
assert(o);
return 0;
}
uint64_t p) {
int r;
Object *o;
assert(f);
assert(p > 0);
while (a > 0) {
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
n = journal_file_entry_array_n_items(o);
if (i < n) {
return 0;
}
i -= n;
ap = a;
}
if (hidx > n)
else
n = n * 2;
if (n < 4)
n = 4;
&o, &q);
if (r < 0)
return r;
#ifdef HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
if (r < 0)
return r;
#endif
if (ap == 0)
else {
if (r < 0)
return r;
}
return 0;
}
uint64_t p) {
int r;
assert(f);
assert(p > 0);
if (*idx == 0)
else {
le64_t i;
r = link_entry_into_array(f, first, &i, p);
if (r < 0)
return r;
}
return 0;
}
uint64_t p;
int r;
assert(f);
assert(o);
if (p == 0)
return -EINVAL;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
return link_entry_into_array_plus_one(f,
&o->data.entry_offset,
&o->data.entry_array_offset,
offset);
}
uint64_t n, i;
int r;
assert(f);
assert(o);
return -EINVAL;
/* Link up the entry itself */
r = link_entry_into_array(f,
&f->header->entry_array_offset,
offset);
if (r < 0)
return r;
/* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
if (f->header->head_entry_realtime == 0)
f->tail_entry_monotonic_valid = true;
/* Link up the items */
n = journal_file_entry_n_items(o);
for (i = 0; i < n; i++) {
r = journal_file_link_entry_item(f, o, offset, i);
if (r < 0)
return r;
}
return 0;
}
static int journal_file_append_entry_internal(
JournalFile *f,
const dual_timestamp *ts,
Object *o;
int r;
assert(f);
if (r < 0)
return r;
#ifdef HAVE_GCRYPT
if (r < 0)
return r;
#endif
r = journal_file_link_entry(f, o, np);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
return 0;
}
assert(f);
/* inotify() does not receive IN_MODIFY events from file
* accesses done via mmap(). After each access we hence
* trigger IN_MODIFY by truncating the journal file to its
* current size which triggers IN_MODIFY. */
}
return 1;
}
int enabled, r;
assert(f);
assert(f->post_change_timer);
timer = f->post_change_timer;
if (r < 0) {
log_debug_errno(r, "Failed to get ftruncate timer state: %m");
goto fail;
}
if (enabled == SD_EVENT_ONESHOT)
return;
if (r < 0) {
log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
goto fail;
}
if (r < 0) {
log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
goto fail;
}
if (r < 0) {
log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
goto fail;
}
return;
fail:
/* On failure, let's simply post the change immediately. */
}
/* Enable coalesced change posting in a timer on the provided sd_event instance */
int r;
assert(f);
assert(e);
assert(t);
if (r < 0)
return r;
if (r < 0)
return r;
f->post_change_timer = timer;
f->post_change_timer_period = t;
return r;
}
return -1;
return 1;
return 0;
}
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
unsigned i;
int r;
assert(f);
if (!ts) {
}
#ifdef HAVE_GCRYPT
if (r < 0)
return r;
#endif
/* alloca() can't take 0, hence let's allocate at least one */
for (i = 0; i < n_iovec; i++) {
uint64_t p;
Object *o;
if (r < 0)
return r;
}
/* Order by the position on disk, in order to improve seek
* times for rotating media. */
/* If the memory mapping triggered a SIGBUS then we return an
* IO error and ignore the error code passed down to us, since
* it is very likely just an effect of a nullified replacement
* mapping page */
r = -EIO;
if (f->post_change_timer)
else
return r;
}
typedef struct ChainCacheItem {
static void chain_cache_put(
OrderedHashmap *h,
if (!ci) {
/* If the chain item to cache for this chain is the
* first one it's not worth caching anything */
return;
if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
} else {
if (!ci)
return;
}
return;
}
} else
}
static int generic_array_get(
JournalFile *f,
uint64_t i,
Object *o;
uint64_t p = 0, a, t = 0;
int r;
assert(f);
a = first;
/* Try the chain cache first */
}
while (a > 0) {
uint64_t k;
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
k = journal_file_entry_array_n_items(o);
if (i < k) {
goto found;
}
i -= k;
t += k;
}
return 0;
/* Let's cache this item for the next invocation */
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
static int generic_array_get_plus_one(
JournalFile *f,
uint64_t i,
Object *o;
assert(f);
if (i == 0) {
int r;
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
return 1;
}
}
enum {
};
static int generic_array_bisect(
JournalFile *f,
uint64_t n,
bool subtract_one = false;
int r;
assert(f);
/* Start with the first array in the chain */
a = first;
/* Ah, we have iterated this bisection array chain
* previously! Let's see if we can skip ahead in the
* chain, as far as the last time. But we can't jump
* backwards in the chain, so let's check that
* first. */
if (r < 0)
return r;
if (r == TEST_LEFT) {
/* OK, what we are looking for is right of the
* begin of this EntryArray, so let's jump
* straight to previously cached array in the
* chain */
}
}
while (a > 0) {
if (r < 0)
return r;
if (right <= 0)
return 0;
i = right - 1;
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
if (r == TEST_RIGHT) {
left = 0;
right -= 1;
/* If we cached the last index we
* looked at, let's try to not to jump
* too wildly around and see if we can
* limit the range to look at early to
* the immediate neighbors of the last
* index we looked at. */
if (last_index > 0) {
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = x;
else
left = x + 1;
}
if (last_index < right) {
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = y;
else
left = y + 1;
}
}
for (;;) {
if (direction == DIRECTION_UP)
subtract_one = true;
i = left;
goto found;
}
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
if (r == TEST_RIGHT)
right = i;
else
left = i + 1;
}
}
if (k >= n) {
if (direction == DIRECTION_UP) {
i = n;
subtract_one = true;
goto found;
}
return 0;
}
n -= k;
t += k;
}
return 0;
if (subtract_one && t == 0 && i == 0)
return 0;
/* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
else
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
if (idx)
return 1;
}
static int generic_array_bisect_plus_one(
JournalFile *f,
uint64_t n,
int r;
bool step_back = false;
Object *o;
assert(f);
if (n <= 0)
return 0;
/* This bisects the array in object 'first', but first checks
* an extra */
if (r < 0)
return r;
if (r == TEST_FOUND)
/* if we are looking with DIRECTION_UP then we need to first
see if in the actual array there is a matching entry, and
return the last one of that. But if there isn't any we need
to return this one. Hence remember this, and return it
below. */
if (r == TEST_LEFT)
if (r == TEST_RIGHT) {
if (direction == DIRECTION_DOWN)
goto found;
else
return 0;
}
if (r == 0 && step_back)
goto found;
if (r > 0 && idx)
(*idx) ++;
return r;
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
if (idx)
*idx = 0;
return 1;
}
assert(f);
assert(p > 0);
if (p == needle)
return TEST_FOUND;
else if (p < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
return TEST_FOUND;
return TEST_LEFT;
else
return TEST_RIGHT;
}
JournalFile *f,
assert(f);
return generic_array_bisect(f,
}
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
return TEST_FOUND;
return TEST_LEFT;
else
return TEST_RIGHT;
}
JournalFile *f,
assert(f);
return generic_array_bisect(f,
}
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
return TEST_FOUND;
return TEST_LEFT;
else
return TEST_RIGHT;
}
static int find_data_object_by_boot_id(
JournalFile *f,
Object **o,
uint64_t *b) {
char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
}
JournalFile *f,
Object *o;
int r;
assert(f);
if (r < 0)
return r;
if (r == 0)
return -ENOENT;
return generic_array_bisect_plus_one(f,
}
f->location_type = LOCATION_HEAD;
f->current_offset = 0;
f->current_seqnum = 0;
f->current_realtime = 0;
f->current_monotonic = 0;
zero(f->current_boot_id);
f->current_xor_hash = 0;
}
f->location_type = LOCATION_SEEK;
f->current_offset = offset;
}
/* If contents and timestamps match, these entries are
* identical, even if the seqnum does not match */
return 0;
/* If this is from the same seqnum source, compare
* seqnums */
return -1;
return 1;
/* Wow! This is weird, different data but the same
* seqnums? Something is borked, but let's make the
* best of it and compare by time. */
}
/* If the boot id matches, compare monotonic time */
return -1;
return 1;
}
/* Otherwise, compare UTC time */
return -1;
return 1;
/* Finally, compare by contents */
return -1;
return 1;
return 0;
}
JournalFile *f,
uint64_t p,
int r;
assert(f);
if (n <= 0)
return 0;
if (p == 0)
else {
r = generic_array_bisect(f,
p,
&i);
if (r <= 0)
return r;
if (direction == DIRECTION_DOWN) {
if (i >= n - 1)
return 0;
i++;
} else {
if (i <= 0)
return 0;
i--;
}
}
/* And jump to it */
r = generic_array_get(f,
i,
if (r <= 0)
return r;
if (p > 0 &&
f->path, i);
return -EBADMSG;
}
if (offset)
return 1;
}
JournalFile *f,
uint64_t n, i;
int r;
Object *d;
assert(f);
assert(p > 0 || !o);
if (r < 0)
return r;
if (n <= 0)
return n;
if (!o)
else {
return -EINVAL;
p,
&i);
if (r <= 0)
return r;
if (direction == DIRECTION_DOWN) {
if (i >= n - 1)
return 0;
i++;
} else {
if (i <= 0)
return 0;
i--;
}
}
return generic_array_get_plus_one(f,
i,
}
JournalFile *f,
uint64_t p,
int r;
Object *d;
assert(f);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
p,
}
JournalFile *f,
Object *o, *d;
int r;
uint64_t b, z;
assert(f);
/* First, seek by time */
r = find_data_object_by_boot_id(f, boot_id, &o, &b);
if (r < 0)
return r;
if (r == 0)
return -ENOENT;
if (r <= 0)
return r;
/* And now, continue seeking until we find an entry that
* exists in both bisection arrays */
for (;;) {
uint64_t p, q;
if (r < 0)
return r;
z,
if (r <= 0)
return r;
r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
if (r < 0)
return r;
p,
if (r <= 0)
return r;
if (p == q) {
if (ret)
if (offset)
*offset = q;
return 1;
}
z = q;
}
}
JournalFile *f,
Object *d;
int r;
assert(f);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
}
JournalFile *f,
Object *d;
int r;
assert(f);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
}
Object *o;
int r;
uint64_t p;
assert(f);
while (p != 0) {
r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
if (r < 0)
goto fail;
case OBJECT_UNUSED:
printf("Type: OBJECT_UNUSED\n");
break;
case OBJECT_DATA:
printf("Type: OBJECT_DATA\n");
break;
case OBJECT_FIELD:
printf("Type: OBJECT_FIELD\n");
break;
case OBJECT_ENTRY:
break;
case OBJECT_FIELD_HASH_TABLE:
printf("Type: OBJECT_FIELD_HASH_TABLE\n");
break;
case OBJECT_DATA_HASH_TABLE:
printf("Type: OBJECT_DATA_HASH_TABLE\n");
break;
case OBJECT_ENTRY_ARRAY:
printf("Type: OBJECT_ENTRY_ARRAY\n");
break;
case OBJECT_TAG:
break;
default:
break;
}
printf("Flags: %s\n",
p = 0;
else
}
return;
fail:
log_error("File corrupt");
}
const char *x;
x = format_timestamp(buf, l, t);
if (x)
return x;
return " --- ";
}
char a[33], b[33], c[33], d[33];
assert(f);
printf("File Path: %s\n"
"File ID: %s\n"
"Machine ID: %s\n"
"Boot ID: %s\n"
"Sequential Number ID: %s\n"
"State: %s\n"
"Compatible Flags:%s%s\n"
"Incompatible Flags:%s%s%s\n"
"Rotate Suggested: %s\n"
"Head Realtime Timestamp: %s\n"
"Tail Realtime Timestamp: %s\n"
"Tail Monotonic Timestamp: %s\n"
f->path,
yes_no(journal_file_rotate_suggested(f, 0)),
"Data Hash Table Fill: %.1f%%\n",
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
"Field Hash Table Fill: %.1f%%\n",
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
}
unsigned attrs;
int r;
assert(f);
/* Before we write anything, check if the COW logic is turned
* off on btrfs. Given our write pattern that is quite
* unfriendly to COW file systems this should greatly improve
* performance on COW file systems, such as btrfs, at the
* expense of data integrity features (which shouldn't be too
* bad, given that we do our own checksumming). */
r = btrfs_is_filesystem(f->fd);
if (r < 0)
return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
if (!r)
return 0;
if (r < 0)
return log_warning_errno(r, "Failed to read file attributes: %m");
if (attrs & FS_NOCOW_FL) {
log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
return 0;
}
log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
"This is likely to slow down journal access substantially, please consider turning "
"off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
return 1;
}
int journal_file_open(
const char *fname,
int flags,
bool compress,
bool seal,
JournalFile **ret) {
bool newly_created = false;
JournalFile *f;
void *h;
int r;
return -EINVAL;
return -EINVAL;
if (!f)
return -ENOMEM;
f->fd = -1;
#if defined(HAVE_LZ4)
f->compress_lz4 = compress;
f->compress_xz = compress;
#endif
#ifdef HAVE_GCRYPT
#endif
if (mmap_cache)
else {
f->mmap = mmap_cache_new();
if (!f->mmap) {
r = -ENOMEM;
goto fail;
}
}
if (!f->path) {
r = -ENOMEM;
goto fail;
}
if (!f->chain_cache) {
r = -ENOMEM;
goto fail;
}
if (f->fd < 0) {
r = -errno;
goto fail;
}
r = journal_file_fstat(f);
if (r < 0)
goto fail;
(void) journal_file_warn_btrfs(f);
/* Let's attach the creation time to the journal file,
* so that the vacuuming code knows the age of this
* file even if the file might end up corrupted one
* day... Ideally we'd just use the creation time many
* file systems maintain for each file, but there is
* currently no usable API to query this, hence let's
* emulate this via extended attributes. If extended
* attributes are not supported we'll just skip this,
fd_setcrtime(f->fd, 0);
#ifdef HAVE_GCRYPT
/* Try to load the FSPRG state, and if we can't, then
* just don't do sealing */
if (f->seal) {
r = journal_file_fss_load(f);
if (r < 0)
f->seal = false;
}
#endif
r = journal_file_init_header(f, template);
if (r < 0)
goto fail;
r = journal_file_fstat(f);
if (r < 0)
goto fail;
newly_created = true;
}
r = -ENODATA;
goto fail;
}
r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
if (r < 0)
goto fail;
f->header = h;
if (!newly_created) {
r = journal_file_verify_header(f);
if (r < 0)
goto fail;
}
#ifdef HAVE_GCRYPT
if (!newly_created && f->writable) {
r = journal_file_fss_load(f);
if (r < 0)
goto fail;
}
#endif
if (f->writable) {
if (metrics) {
} else if (template)
r = journal_file_refresh_header(f);
if (r < 0)
goto fail;
}
#ifdef HAVE_GCRYPT
r = journal_file_hmac_setup(f);
if (r < 0)
goto fail;
#endif
if (newly_created) {
if (r < 0)
goto fail;
if (r < 0)
goto fail;
#ifdef HAVE_GCRYPT
r = journal_file_append_first_tag(f);
if (r < 0)
goto fail;
#endif
}
r = -EIO;
goto fail;
}
f,
if (r < 0)
goto fail;
}
*ret = f;
return 0;
fail:
r = -EIO;
return r;
}
_cleanup_free_ char *p = NULL;
size_t l;
int r;
assert(f);
assert(*f);
old_file = *f;
return -EINVAL;
return -EINVAL;
if (r < 0)
return -ENOMEM;
/* Try to rename the file to the archived version. If the file
* already was deleted, we'll get ENOENT, let's ignore that
* case. */
return -errno;
/* Currently, btrfs is not very good with out write patterns
* and fragments heavily. Let's defrag our journal files when
* we archive them */
old_file->defrag_on_close = true;
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
*f = new_file;
return r;
}
const char *fname,
int flags,
bool compress,
bool seal,
JournalFile **ret) {
int r;
size_t l;
_cleanup_free_ char *p = NULL;
if (!IN_SET(r,
-EBADMSG, /* corrupted */
-ENODATA, /* truncated */
-EHOSTDOWN, /* other machine */
-EPROTONOSUPPORT, /* incompatible feature */
-EBUSY, /* unclean shutdown */
-ESHUTDOWN, /* already archived */
-EIO, /* IO error, including SIGBUS on mmap */
-EIDRM /* File has been deleted */))
return r;
return r;
return r;
return r;
/* The file is corrupted. Rotate it away and try it again (but only once) */
(int) l - 8, fname,
random_u64()) < 0)
return -ENOMEM;
return -errno;
/* btrfs doesn't cope well with our write pattern and
* fragments heavily. Let's defrag all files we rotate */
(void) chattr_path(p, false, FS_NOCOW_FL);
(void) btrfs_defrag(p);
}
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
uint64_t i, n;
int r;
assert(o);
assert(p);
return -EPERM;
n = journal_file_entry_n_items(o);
/* alloca() can't take 0, hence let's allocate at least one */
for (i = 0; i < n; i++) {
uint64_t l, h;
size_t t;
void *data;
Object *u;
if (r < 0)
return r;
return -EBADMSG;
t = (size_t) l;
/* We hit the limit on 32bit machines */
if ((uint64_t) t != l)
return -E2BIG;
if (r < 0)
return r;
l = rsize;
#else
return -EPROTONOSUPPORT;
#endif
} else
if (r < 0)
return r;
if (r < 0)
return r;
}
return -EIO;
return r;
}
assert(m);
/* Set everything to "pick automatic values". */
*m = (JournalMetrics) {
};
}
char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
assert(m);
else {
fs_size = 0;
}
if (fs_size > 0) {
if (m->max_use > DEFAULT_MAX_USE_UPPER)
m->max_use = DEFAULT_MAX_USE_UPPER;
if (m->max_use < DEFAULT_MAX_USE_LOWER)
m->max_use = DEFAULT_MAX_USE_LOWER;
} else
m->max_use = DEFAULT_MAX_USE_LOWER;
} else {
}
m->min_use = DEFAULT_MIN_USE;
if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
} else
if (m->max_size != 0) {
if (m->max_size < JOURNAL_FILE_SIZE_MIN)
}
else {
if (m->min_size < JOURNAL_FILE_SIZE_MIN)
}
if (fs_size > 0) {
if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
} else
m->keep_free = DEFAULT_KEEP_FREE;
}
format_bytes(a, sizeof(a), m->min_use),
format_bytes(b, sizeof(b), m->max_use),
format_bytes(c, sizeof(c), m->max_size),
format_bytes(d, sizeof(d), m->min_size),
format_bytes(e, sizeof(e), m->keep_free),
m->n_max_files);
}
assert(f);
if (from) {
if (f->header->head_entry_realtime == 0)
return -ENOENT;
}
if (to) {
if (f->header->tail_entry_realtime == 0)
return -ENOENT;
}
return 1;
}
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
Object *o;
uint64_t p;
int r;
assert(f);
r = find_data_object_by_boot_id(f, boot_id, &o, &p);
if (r <= 0)
return r;
return 0;
if (from) {
if (r < 0)
return r;
}
if (to) {
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
r = generic_array_get_plus_one(f,
&o, NULL);
if (r <= 0)
return r;
}
return 1;
}
assert(f);
/* If we gained new header fields we gained new features,
* hence suggest a rotation */
return true;
}
/* Let's check if the hash tables grew over a certain fill
* level (75%, borrowing this value from Java's hash table
* implementation), and if so suggest a rotation. To calculate
* the fill level we need the n_data field, which only exists
* in newer versions. */
if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
return true;
}
if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
return true;
}
/* Are the data objects properly indexed by field objects? */
return true;
if (max_file_usec > 0) {
usec_t t, h;
t = now(CLOCK_REALTIME);
if (h > 0 && t > h + max_file_usec)
return true;
}
return false;
}