journal-file.c revision ccdbaf911196e2fc80eaa914bf22e01035ee1f3f
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering/***
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering This file is part of systemd.
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering Copyright 2011 Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering systemd is free software; you can redistribute it and/or modify it
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering under the terms of the GNU General Public License as published by
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering the Free Software Foundation; either version 2 of the License, or
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering (at your option) any later version.
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering systemd is distributed in the hope that it will be useful, but
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering WITHOUT ANY WARRANTY; without even the implied warranty of
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering General Public License for more details.
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering You should have received a copy of the GNU General Public License
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering along with systemd; If not, see <http://www.gnu.org/licenses/>.
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering***/
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering#include <sys/mman.h>
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering#include <errno.h>
3c0cf502796be355431d4a64d738e75f543aa51dLennart Poettering#include <sys/uio.h>
3c0cf502796be355431d4a64d738e75f543aa51dLennart Poettering#include <unistd.h>
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering#include <sys/statvfs.h>
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering#include <fcntl.h>
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering#include <stddef.h>
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering#include "journal-def.h"
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering#include "journal-file.h"
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering#include "lookup3.h"
e3309036cda30bdb737ef6e441716b93943677e7Zbigniew Jędrzejewski-Szmek#include "compress.h"
e3309036cda30bdb737ef6e441716b93943677e7Zbigniew Jędrzejewski-Szmek
e3309036cda30bdb737ef6e441716b93943677e7Zbigniew Jędrzejewski-Szmek#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
e3309036cda30bdb737ef6e441716b93943677e7Zbigniew Jędrzejewski-Szmek#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define COMPRESSION_SIZE_THRESHOLD (512ULL)
9c5e12a4314e7192e834e1b855e5e80111e636a6Tom Gundersen
7586f4d172dd9c3ccc3126fc47dca9e49adec132Tom Gundersen/* This is the minimum journal file size */
d74fb368b18f0fbd9a4fe6f15691bbea7f3c4a01Tom Gundersen#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen/* These are the lower and upper bounds if we deduce the max_use value
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen * from the file system size */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen/* This is the upper bound if we deduce max_size from max_use */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen/* This is the upper bound if we deduce the keep_free value from the
3e684349c2cead2e6fd2f816c34eb17daba23a49Lennart Poettering * file system size */
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
3e684349c2cead2e6fd2f816c34eb17daba23a49Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering/* This is the keep_free value when we can't determine the system
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering * size */
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
91b14d6ff362b938a72db17b095ee9903d07381bTom Gundersen
91b14d6ff362b938a72db17b095ee9903d07381bTom Gundersenstatic const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering
3c0cf502796be355431d4a64d738e75f543aa51dLennart Poettering#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
3c0cf502796be355431d4a64d738e75f543aa51dLennart Poettering
0dd25fb9f005d8ab7ac4bc10a609d00569f8c56aLennart Poetteringvoid journal_file_close(JournalFile *f) {
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering int t;
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
6cb08a8930bdaca950b152b1e8b82466ed59511cLennart Poettering assert(f);
6cb08a8930bdaca950b152b1e8b82466ed59511cLennart Poettering
9df3ba6c6cb65eecec06f39dfe85a3596cedac4eTom Gundersen if (f->header && f->writable)
9df3ba6c6cb65eecec06f39dfe85a3596cedac4eTom Gundersen f->header->state = STATE_OFFLINE;
9df3ba6c6cb65eecec06f39dfe85a3596cedac4eTom Gundersen
f4461e5641d53f27d6e76e0607bdaa9c0c58c1f6Lennart Poettering
f4461e5641d53f27d6e76e0607bdaa9c0c58c1f6Lennart Poettering for (t = 0; t < _WINDOW_MAX; t++)
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering if (f->windows[t].ptr)
d74fb368b18f0fbd9a4fe6f15691bbea7f3c4a01Tom Gundersen munmap(f->windows[t].ptr, f->windows[t].size);
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering if (f->fd >= 0)
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering close_nointr_nofail(f->fd);
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering free(f->path);
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering#ifdef HAVE_XZ
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering free(f->compress_buffer);
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering#endif
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen
be808ea083fa07271116b4519c3c27fd20c5f077Tom Gundersen free(f);
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering}
1e02e182f1e06fcbe389474175de228103be39cbLennart Poettering
1e02e182f1e06fcbe389474175de228103be39cbLennart Poetteringstatic int journal_file_init_header(JournalFile *f, JournalFile *template) {
1e02e182f1e06fcbe389474175de228103be39cbLennart Poettering Header h;
b652d4a2099d1c167584dcc1d179d47c58dc38a2Lennart Poettering ssize_t k;
b652d4a2099d1c167584dcc1d179d47c58dc38a2Lennart Poettering int r;
b652d4a2099d1c167584dcc1d179d47c58dc38a2Lennart Poettering
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering assert(f);
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering zero(h);
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering memcpy(h.signature, signature, 8);
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering h.arena_offset = htole64(ALIGN64(sizeof(h)));
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering r = sd_id128_randomize(&h.file_id);
0b58db658b5c3f586ac3a837427f1f7fec2abb2eLennart Poettering if (r < 0)
4e945a6f7971fd7d1f6b2c62ee3afdaff3c95ce4Lennart Poettering return r;
0b58db658b5c3f586ac3a837427f1f7fec2abb2eLennart Poettering
0dd25fb9f005d8ab7ac4bc10a609d00569f8c56aLennart Poettering if (template) {
3c0cf502796be355431d4a64d738e75f543aa51dLennart Poettering h.seqnum_id = template->header->seqnum_id;
74b2466e14a1961bf3ac0e8a60cfaceec705bd59Lennart Poettering h.seqnum = template->header->seqnum;
91b14d6ff362b938a72db17b095ee9903d07381bTom Gundersen } else
91b14d6ff362b938a72db17b095ee9903d07381bTom Gundersen h.seqnum_id = h.file_id;
87f5a19343acf8ba697acc5a62bdb1a2b8c9eda3Lennart Poettering
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering k = pwrite(f->fd, &h, sizeof(h), 0);
0b58db658b5c3f586ac3a837427f1f7fec2abb2eLennart Poettering if (k < 0)
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering return -errno;
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering if (k != sizeof(h))
f4461e5641d53f27d6e76e0607bdaa9c0c58c1f6Lennart Poettering return -EIO;
6bb2c08597c999c429e889cd2403b2fef5f3e1a0Lennart Poettering
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering return 0;
de54e62b4bd7856fb897c9a2ee93cc228adb2135Lennart Poettering}
9df3ba6c6cb65eecec06f39dfe85a3596cedac4eTom Gundersen
f4461e5641d53f27d6e76e0607bdaa9c0c58c1f6Lennart Poetteringstatic int journal_file_refresh_header(JournalFile *f) {
f4461e5641d53f27d6e76e0607bdaa9c0c58c1f6Lennart Poettering int r;
519ef04651b07a547f010d6462603669d7fde4e5Lennart Poettering sd_id128_t boot_id;
519ef04651b07a547f010d6462603669d7fde4e5Lennart Poettering
6cb08a8930bdaca950b152b1e8b82466ed59511cLennart Poettering assert(f);
6cb08a8930bdaca950b152b1e8b82466ed59511cLennart Poettering
92ec902aad1ade7acbe50efd7b8ef87fbdc63af3Lennart Poettering r = sd_id128_get_machine(&f->header->machine_id);
92ec902aad1ade7acbe50efd7b8ef87fbdc63af3Lennart Poettering if (r < 0)
1e02e182f1e06fcbe389474175de228103be39cbLennart Poettering return r;
1e02e182f1e06fcbe389474175de228103be39cbLennart Poettering
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering r = sd_id128_get_boot(&boot_id);
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering if (r < 0)
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering return r;
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering if (sd_id128_equal(boot_id, f->header->boot_id))
f2f1dbe50fea13abadc9c1e845a29031b90b40f3Lennart Poettering f->tail_entry_monotonic_valid = true;
4b95f1798f22c1bb75295f448188560cb6ec9eceLennart Poettering
636e813dc98ea40c58c6c85bc5e7e3c9f0904ea2Lennart Poettering f->header->boot_id = boot_id;
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering f->header->state = STATE_ONLINE;
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering
0eac462399c8e87bcce252cf058eba9f2678f2bdLennart Poettering __sync_synchronize();
8300ba218e3cf5049496937be8bce10f22d09bbcTom Gundersen
8300ba218e3cf5049496937be8bce10f22d09bbcTom Gundersen return 0;
d5099efc47d4e6ac60816b5381a5f607ab03f06eMichal Schmidt}
static int journal_file_verify_header(JournalFile *f) {
assert(f);
if (memcmp(f->header, signature, 8))
return -EBADMSG;
#ifdef HAVE_XZ
if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
return -EPROTONOSUPPORT;
#else
if (f->header->incompatible_flags != 0)
return -EPROTONOSUPPORT;
#endif
if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
return -ENODATA;
if (f->writable) {
uint8_t state;
sd_id128_t machine_id;
int r;
r = sd_id128_get_machine(&machine_id);
if (r < 0)
return r;
if (!sd_id128_equal(machine_id, f->header->machine_id))
return -EHOSTDOWN;
state = f->header->state;
if (state == STATE_ONLINE)
log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
else if (state == STATE_ARCHIVED)
return -ESHUTDOWN;
else if (state != STATE_OFFLINE)
log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
}
return 0;
}
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
uint64_t old_size, new_size;
assert(f);
/* We assume that this file is not sparse, and we know that
* for sure, since we always call posix_fallocate()
* ourselves */
old_size =
le64toh(f->header->arena_offset) +
le64toh(f->header->arena_size);
new_size = PAGE_ALIGN(offset + size);
if (new_size < le64toh(f->header->arena_offset))
new_size = le64toh(f->header->arena_offset);
if (new_size <= old_size)
return 0;
if (f->metrics.max_size > 0 &&
new_size > f->metrics.max_size)
return -E2BIG;
if (new_size > f->metrics.min_size &&
f->metrics.keep_free > 0) {
struct statvfs svfs;
if (fstatvfs(f->fd, &svfs) >= 0) {
uint64_t available;
available = svfs.f_bfree * svfs.f_bsize;
if (available >= f->metrics.keep_free)
available -= f->metrics.keep_free;
else
available = 0;
if (new_size - old_size > available)
return -E2BIG;
}
}
/* Note that the glibc fallocate() fallback is very
inefficient, hence we try to minimize the allocation area
as we can. */
if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
return -errno;
if (fstat(f->fd, &f->last_stat) < 0)
return -errno;
f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
return 0;
}
static int journal_file_map(
JournalFile *f,
uint64_t offset,
uint64_t size,
void **_window,
uint64_t *_woffset,
uint64_t *_wsize,
void **ret) {
uint64_t woffset, wsize;
void *window;
assert(f);
assert(size > 0);
assert(ret);
woffset = offset & ~((uint64_t) page_size() - 1ULL);
wsize = size + (offset - woffset);
wsize = PAGE_ALIGN(wsize);
/* Avoid SIGBUS on invalid accesses */
if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
return -EADDRNOTAVAIL;
window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
if (window == MAP_FAILED)
return -errno;
if (_window)
*_window = window;
if (_woffset)
*_woffset = woffset;
if (_wsize)
*_wsize = wsize;
*ret = (uint8_t*) window + (offset - woffset);
return 0;
}
static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
void *p = NULL;
uint64_t delta;
int r;
Window *w;
assert(f);
assert(ret);
assert(wt >= 0);
assert(wt < _WINDOW_MAX);
if (offset + size > (uint64_t) f->last_stat.st_size) {
/* Hmm, out of range? Let's refresh the fstat() data
* first, before we trust that check. */
if (fstat(f->fd, &f->last_stat) < 0 ||
offset + size > (uint64_t) f->last_stat.st_size)
return -EADDRNOTAVAIL;
}
w = f->windows + wt;
if (_likely_(w->ptr &&
w->offset <= offset &&
w->offset + w->size >= offset + size)) {
*ret = (uint8_t*) w->ptr + (offset - w->offset);
return 0;
}
if (w->ptr) {
if (munmap(w->ptr, w->size) < 0)
return -errno;
w->ptr = NULL;
w->size = w->offset = 0;
}
if (size < DEFAULT_WINDOW_SIZE) {
/* If the default window size is larger then what was
* asked for extend the mapping a bit in the hope to
* minimize needed remappings later on. We add half
* the window space before and half behind the
* requested mapping */
delta = (DEFAULT_WINDOW_SIZE - size) / 2;
if (delta > offset)
delta = offset;
offset -= delta;
size = DEFAULT_WINDOW_SIZE;
} else
delta = 0;
if (offset + size > (uint64_t) f->last_stat.st_size)
size = (uint64_t) f->last_stat.st_size - offset;
if (size <= 0)
return -EADDRNOTAVAIL;
r = journal_file_map(f,
offset, size,
&w->ptr, &w->offset, &w->size,
&p);
if (r < 0)
return r;
*ret = (uint8_t*) p + delta;
return 0;
}
static bool verify_hash(Object *o) {
uint64_t h1, h2;
assert(o);
if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
h1 = le64toh(o->data.hash);
h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
} else if (o->object.type == OBJECT_FIELD) {
h1 = le64toh(o->field.hash);
h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
} else
return true;
return h1 == h2;
}
int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
int r;
void *t;
Object *o;
uint64_t s;
assert(f);
assert(ret);
assert(type < _OBJECT_TYPE_MAX);
r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
if (r < 0)
return r;
o = (Object*) t;
s = le64toh(o->object.size);
if (s < sizeof(ObjectHeader))
return -EBADMSG;
if (type >= 0 && o->object.type != type)
return -EBADMSG;
if (s > sizeof(ObjectHeader)) {
r = journal_file_move_to(f, o->object.type, offset, s, &t);
if (r < 0)
return r;
o = (Object*) t;
}
if (!verify_hash(o))
return -EBADMSG;
*ret = o;
return 0;
}
static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
uint64_t r;
assert(f);
r = le64toh(f->header->seqnum) + 1;
if (seqnum) {
/* If an external seqnum counter was passed, we update
* both the local and the external one, and set it to
* the maximum of both */
if (*seqnum + 1 > r)
r = *seqnum + 1;
*seqnum = r;
}
f->header->seqnum = htole64(r);
if (f->header->first_seqnum == 0)
f->header->first_seqnum = htole64(r);
return r;
}
static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
int r;
uint64_t p;
Object *tail, *o;
void *t;
assert(f);
assert(size >= sizeof(ObjectHeader));
assert(offset);
assert(ret);
p = le64toh(f->header->tail_object_offset);
if (p == 0)
p = le64toh(f->header->arena_offset);
else {
r = journal_file_move_to_object(f, -1, p, &tail);
if (r < 0)
return r;
p += ALIGN64(le64toh(tail->object.size));
}
r = journal_file_allocate(f, p, size);
if (r < 0)
return r;
r = journal_file_move_to(f, type, p, size, &t);
if (r < 0)
return r;
o = (Object*) t;
zero(o->object);
o->object.type = type;
o->object.size = htole64(size);
f->header->tail_object_offset = htole64(p);
f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
*ret = o;
*offset = p;
return 0;
}
static int journal_file_setup_data_hash_table(JournalFile *f) {
uint64_t s, p;
Object *o;
int r;
assert(f);
s = DEFAULT_DATA_HASH_TABLE_SIZE;
r = journal_file_append_object(f,
OBJECT_DATA_HASH_TABLE,
offsetof(Object, hash_table.items) + s,
&o, &p);
if (r < 0)
return r;
memset(o->hash_table.items, 0, s);
f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
f->header->data_hash_table_size = htole64(s);
return 0;
}
static int journal_file_setup_field_hash_table(JournalFile *f) {
uint64_t s, p;
Object *o;
int r;
assert(f);
s = DEFAULT_FIELD_HASH_TABLE_SIZE;
r = journal_file_append_object(f,
OBJECT_FIELD_HASH_TABLE,
offsetof(Object, hash_table.items) + s,
&o, &p);
if (r < 0)
return r;
memset(o->hash_table.items, 0, s);
f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
f->header->field_hash_table_size = htole64(s);
return 0;
}
static int journal_file_map_data_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
p = le64toh(f->header->data_hash_table_offset);
s = le64toh(f->header->data_hash_table_size);
r = journal_file_move_to(f,
WINDOW_DATA_HASH_TABLE,
p, s,
&t);
if (r < 0)
return r;
f->data_hash_table = t;
return 0;
}
static int journal_file_map_field_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
p = le64toh(f->header->field_hash_table_offset);
s = le64toh(f->header->field_hash_table_size);
r = journal_file_move_to(f,
WINDOW_FIELD_HASH_TABLE,
p, s,
&t);
if (r < 0)
return r;
f->field_hash_table = t;
return 0;
}
static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
uint64_t p, h;
int r;
assert(f);
assert(o);
assert(offset > 0);
assert(o->object.type == OBJECT_DATA);
/* This might alter the window we are looking at */
o->data.next_hash_offset = o->data.next_field_offset = 0;
o->data.entry_offset = o->data.entry_array_offset = 0;
o->data.n_entries = 0;
h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
p = le64toh(f->data_hash_table[h].head_hash_offset);
if (p == 0) {
/* Only entry in the hash table is easy */
f->data_hash_table[h].head_hash_offset = htole64(offset);
} else {
/* Move back to the previous data object, to patch in
* pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
o->data.next_hash_offset = htole64(offset);
}
f->data_hash_table[h].tail_hash_offset = htole64(offset);
return 0;
}
int journal_file_find_data_object_with_hash(
JournalFile *f,
const void *data, uint64_t size, uint64_t hash,
Object **ret, uint64_t *offset) {
uint64_t p, osize, h;
int r;
assert(f);
assert(data || size == 0);
osize = offsetof(Object, data.payload) + size;
if (f->header->data_hash_table_size == 0)
return -EBADMSG;
h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
p = le64toh(f->data_hash_table[h].head_hash_offset);
while (p > 0) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
if (le64toh(o->data.hash) != hash)
goto next;
if (o->object.flags & OBJECT_COMPRESSED) {
#ifdef HAVE_XZ
uint64_t l, rsize;
l = le64toh(o->object.size);
if (l <= offsetof(Object, data.payload))
return -EBADMSG;
l -= offsetof(Object, data.payload);
if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
return -EBADMSG;
if (rsize == size &&
memcmp(f->compress_buffer, data, size) == 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
#else
return -EPROTONOSUPPORT;
#endif
} else if (le64toh(o->object.size) == osize &&
memcmp(o->data.payload, data, size) == 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
next:
p = le64toh(o->data.next_hash_offset);
}
return 0;
}
int journal_file_find_data_object(
JournalFile *f,
const void *data, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash;
assert(f);
assert(data || size == 0);
hash = hash64(data, size);
return journal_file_find_data_object_with_hash(f,
data, size, hash,
ret, offset);
}
static int journal_file_append_data(
JournalFile *f,
const void *data, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash, p;
uint64_t osize;
Object *o;
int r;
bool compressed = false;
assert(f);
assert(data || size == 0);
hash = hash64(data, size);
r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
if (r < 0)
return r;
else if (r > 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
osize = offsetof(Object, data.payload) + size;
r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
if (r < 0)
return r;
o->data.hash = htole64(hash);
#ifdef HAVE_XZ
if (f->compress &&
size >= COMPRESSION_SIZE_THRESHOLD) {
uint64_t rsize;
compressed = compress_blob(data, size, o->data.payload, &rsize);
if (compressed) {
o->object.size = htole64(offsetof(Object, data.payload) + rsize);
o->object.flags |= OBJECT_COMPRESSED;
f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
}
}
#endif
if (!compressed)
memcpy(o->data.payload, data, size);
r = journal_file_link_data(f, o, p, hash);
if (r < 0)
return r;
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
uint64_t journal_file_entry_n_items(Object *o) {
assert(o);
assert(o->object.type == OBJECT_ENTRY);
return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
}
static uint64_t journal_file_entry_array_n_items(Object *o) {
assert(o);
assert(o->object.type == OBJECT_ENTRY_ARRAY);
return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
}
static int link_entry_into_array(JournalFile *f,
uint64_t *first,
uint64_t *idx,
uint64_t p) {
int r;
uint64_t n = 0, ap = 0, q, i, a, hidx;
Object *o;
assert(f);
assert(first);
assert(idx);
assert(p > 0);
a = le64toh(*first);
i = hidx = le64toh(*idx);
while (a > 0) {
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
n = journal_file_entry_array_n_items(o);
if (i < n) {
o->entry_array.items[i] = htole64(p);
*idx = htole64(hidx + 1);
return 0;
}
i -= n;
ap = a;
a = le64toh(o->entry_array.next_entry_array_offset);
}
if (hidx > n)
n = (hidx+1) * 2;
else
n = n * 2;
if (n < 4)
n = 4;
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
&o, &q);
if (r < 0)
return r;
o->entry_array.items[i] = htole64(p);
if (ap == 0)
*first = htole64(q);
else {
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
if (r < 0)
return r;
o->entry_array.next_entry_array_offset = htole64(q);
}
*idx = htole64(hidx + 1);
return 0;
}
static int link_entry_into_array_plus_one(JournalFile *f,
uint64_t *extra,
uint64_t *first,
uint64_t *idx,
uint64_t p) {
int r;
assert(f);
assert(extra);
assert(first);
assert(idx);
assert(p > 0);
if (*idx == 0)
*extra = htole64(p);
else {
uint64_t i;
i = htole64(le64toh(*idx) - 1);
r = link_entry_into_array(f, first, &i, p);
if (r < 0)
return r;
}
*idx = htole64(le64toh(*idx) + 1);
return 0;
}
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
uint64_t p;
int r;
assert(f);
assert(o);
assert(offset > 0);
p = le64toh(o->entry.items[i].object_offset);
if (p == 0)
return -EINVAL;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
return link_entry_into_array_plus_one(f,
&o->data.entry_offset,
&o->data.entry_array_offset,
&o->data.n_entries,
offset);
}
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
uint64_t n, i;
int r;
assert(f);
assert(o);
assert(offset > 0);
assert(o->object.type == OBJECT_ENTRY);
__sync_synchronize();
/* Link up the entry itself */
r = link_entry_into_array(f,
&f->header->entry_array_offset,
&f->header->n_entries,
offset);
if (r < 0)
return r;
/* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
if (f->header->head_entry_realtime == 0)
f->header->head_entry_realtime = o->entry.realtime;
f->header->tail_entry_realtime = o->entry.realtime;
f->header->tail_entry_monotonic = o->entry.monotonic;
f->tail_entry_monotonic_valid = true;
/* Link up the items */
n = journal_file_entry_n_items(o);
for (i = 0; i < n; i++) {
r = journal_file_link_entry_item(f, o, offset, i);
if (r < 0)
return r;
}
return 0;
}
static int journal_file_append_entry_internal(
JournalFile *f,
const dual_timestamp *ts,
uint64_t xor_hash,
const EntryItem items[], unsigned n_items,
uint64_t *seqnum,
Object **ret, uint64_t *offset) {
uint64_t np;
uint64_t osize;
Object *o;
int r;
assert(f);
assert(items || n_items == 0);
assert(ts);
osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
if (r < 0)
return r;
o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
o->entry.realtime = htole64(ts->realtime);
o->entry.monotonic = htole64(ts->monotonic);
o->entry.xor_hash = htole64(xor_hash);
o->entry.boot_id = f->header->boot_id;
r = journal_file_link_entry(f, o, np);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = np;
return 0;
}
void journal_file_post_change(JournalFile *f) {
assert(f);
/* inotify() does not receive IN_MODIFY events from file
* accesses done via mmap(). After each access we hence
* trigger IN_MODIFY by truncating the journal file to its
* current size which triggers IN_MODIFY. */
__sync_synchronize();
if (ftruncate(f->fd, f->last_stat.st_size) < 0)
log_error("Failed to to truncate file to its own size: %m");
}
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
unsigned i;
EntryItem *items;
int r;
uint64_t xor_hash = 0;
struct dual_timestamp _ts;
assert(f);
assert(iovec || n_iovec == 0);
if (!f->writable)
return -EPERM;
if (!ts) {
dual_timestamp_get(&_ts);
ts = &_ts;
}
if (f->tail_entry_monotonic_valid &&
ts->monotonic < le64toh(f->header->tail_entry_monotonic))
return -EINVAL;
items = alloca(sizeof(EntryItem) * n_iovec);
for (i = 0; i < n_iovec; i++) {
uint64_t p;
Object *o;
r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
if (r < 0)
return r;
xor_hash ^= le64toh(o->data.hash);
items[i].object_offset = htole64(p);
items[i].hash = o->data.hash;
}
r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
journal_file_post_change(f);
return r;
}
static int generic_array_get(JournalFile *f,
uint64_t first,
uint64_t i,
Object **ret, uint64_t *offset) {
Object *o;
uint64_t p = 0, a;
int r;
assert(f);
a = first;
while (a > 0) {
uint64_t n;
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
n = journal_file_entry_array_n_items(o);
if (i < n) {
p = le64toh(o->entry_array.items[i]);
break;
}
i -= n;
a = le64toh(o->entry_array.next_entry_array_offset);
}
if (a <= 0 || p <= 0)
return 0;
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
static int generic_array_get_plus_one(JournalFile *f,
uint64_t extra,
uint64_t first,
uint64_t i,
Object **ret, uint64_t *offset) {
Object *o;
assert(f);
if (i == 0) {
int r;
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = extra;
return 1;
}
return generic_array_get(f, first, i-1, ret, offset);
}
enum {
TEST_FOUND,
TEST_LEFT,
TEST_RIGHT
};
static int generic_array_bisect(JournalFile *f,
uint64_t first,
uint64_t n,
uint64_t needle,
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
direction_t direction,
Object **ret,
uint64_t *offset,
uint64_t *idx) {
uint64_t a, p, t = 0, i = 0, last_p = 0;
bool subtract_one = false;
Object *o, *array = NULL;
int r;
assert(f);
assert(test_object);
a = first;
while (a > 0) {
uint64_t left, right, k, lp;
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
if (r < 0)
return r;
k = journal_file_entry_array_n_items(array);
right = MIN(k, n);
if (right <= 0)
return 0;
i = right - 1;
lp = p = le64toh(array->entry_array.items[i]);
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT) {
left = 0;
right -= 1;
for (;;) {
if (left == right) {
if (direction == DIRECTION_UP)
subtract_one = true;
i = left;
goto found;
}
assert(left < right);
i = (left + right) / 2;
p = le64toh(array->entry_array.items[i]);
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT)
right = i;
else
left = i + 1;
}
}
if (k > n)
return 0;
last_p = lp;
n -= k;
t += k;
a = le64toh(array->entry_array.next_entry_array_offset);
}
return 0;
found:
if (subtract_one && t == 0 && i == 0)
return 0;
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
p = le64toh(array->entry_array.items[i-1]);
else
p = le64toh(array->entry_array.items[i]);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
if (idx)
*idx = t + i - (subtract_one ? 1 : 0);
return 1;
}
static int generic_array_bisect_plus_one(JournalFile *f,
uint64_t extra,
uint64_t first,
uint64_t n,
uint64_t needle,
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
direction_t direction,
Object **ret,
uint64_t *offset,
uint64_t *idx) {
int r;
assert(f);
assert(test_object);
if (n <= 0)
return 0;
/* This bisects the array in object 'first', but first checks
* an extra */
r = test_object(f, extra, needle);
if (r < 0)
return r;
else if (r == TEST_FOUND) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = extra;
if (idx)
*idx = 0;
return 1;
} else if (r == TEST_RIGHT)
return 0;
r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
if (r > 0)
(*idx) ++;
return r;
}
static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.seqnum) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.seqnum) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_move_to_entry_by_seqnum(
JournalFile *f,
uint64_t seqnum,
direction_t direction,
Object **ret,
uint64_t *offset) {
return generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
seqnum,
test_object_seqnum,
direction,
ret, offset, NULL);
}
static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.realtime) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.realtime) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_move_to_entry_by_realtime(
JournalFile *f,
uint64_t realtime,
direction_t direction,
Object **ret,
uint64_t *offset) {
return generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
realtime,
test_object_realtime,
direction,
ret, offset, NULL);
}
static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.monotonic) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.monotonic) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_move_to_entry_by_monotonic(
JournalFile *f,
sd_id128_t boot_id,
uint64_t monotonic,
direction_t direction,
Object **ret,
uint64_t *offset) {
char t[8+32+1] = "_BOOT_ID=";
Object *o;
int r;
sd_id128_to_string(boot_id, t + 8);
r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
if (r < 0)
return r;
else if (r == 0)
return -ENOENT;
return generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
monotonic,
test_object_monotonic,
direction,
ret, offset, NULL);
}
static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
assert(f);
assert(p > 0);
if (p == needle)
return TEST_FOUND;
else if (p < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_next_entry(
JournalFile *f,
Object *o, uint64_t p,
direction_t direction,
Object **ret, uint64_t *offset) {
uint64_t i, n;
int r;
assert(f);
assert(p > 0 || !o);
n = le64toh(f->header->n_entries);
if (n <= 0)
return 0;
if (!o)
i = direction == DIRECTION_DOWN ? 0 : n - 1;
else {
if (o->object.type != OBJECT_ENTRY)
return -EINVAL;
r = generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
p,
test_object_offset,
DIRECTION_DOWN,
NULL, NULL,
&i);
if (r <= 0)
return r;
if (direction == DIRECTION_DOWN) {
if (i >= n - 1)
return 0;
i++;
} else {
if (i <= 0)
return 0;
i--;
}
}
/* And jump to it */
return generic_array_get(f,
le64toh(f->header->entry_array_offset),
i,
ret, offset);
}
int journal_file_skip_entry(
JournalFile *f,
Object *o, uint64_t p,
int64_t skip,
Object **ret, uint64_t *offset) {
uint64_t i, n;
int r;
assert(f);
assert(o);
assert(p > 0);
if (o->object.type != OBJECT_ENTRY)
return -EINVAL;
r = generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
p,
test_object_offset,
DIRECTION_DOWN,
NULL, NULL,
&i);
if (r <= 0)
return r;
/* Calculate new index */
if (skip < 0) {
if ((uint64_t) -skip >= i)
i = 0;
else
i = i - (uint64_t) -skip;
} else
i += (uint64_t) skip;
n = le64toh(f->header->n_entries);
if (n <= 0)
return -EBADMSG;
if (i >= n)
i = n-1;
return generic_array_get(f,
le64toh(f->header->entry_array_offset),
i,
ret, offset);
}
int journal_file_next_entry_for_data(
JournalFile *f,
Object *o, uint64_t p,
uint64_t data_offset,
direction_t direction,
Object **ret, uint64_t *offset) {
uint64_t n, i;
int r;
Object *d;
assert(f);
assert(p > 0 || !o);
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
n = le64toh(d->data.n_entries);
if (n <= 0)
return n;
if (!o)
i = direction == DIRECTION_DOWN ? 0 : n - 1;
else {
if (o->object.type != OBJECT_ENTRY)
return -EINVAL;
r = generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
p,
test_object_offset,
DIRECTION_DOWN,
NULL, NULL,
&i);
if (r <= 0)
return r;
if (direction == DIRECTION_DOWN) {
if (i >= n - 1)
return 0;
i++;
} else {
if (i <= 0)
return 0;
i--;
}
}
return generic_array_get_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
i,
ret, offset);
}
int journal_file_move_to_entry_by_seqnum_for_data(
JournalFile *f,
uint64_t data_offset,
uint64_t seqnum,
direction_t direction,
Object **ret, uint64_t *offset) {
Object *d;
int r;
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r <= 0)
return r;
return generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
seqnum,
test_object_seqnum,
direction,
ret, offset, NULL);
}
int journal_file_move_to_entry_by_realtime_for_data(
JournalFile *f,
uint64_t data_offset,
uint64_t realtime,
direction_t direction,
Object **ret, uint64_t *offset) {
Object *d;
int r;
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r <= 0)
return r;
return generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
realtime,
test_object_realtime,
direction,
ret, offset, NULL);
}
void journal_file_dump(JournalFile *f) {
char a[33], b[33], c[33];
Object *o;
int r;
uint64_t p;
assert(f);
printf("File Path: %s\n"
"File ID: %s\n"
"Machine ID: %s\n"
"Boot ID: %s\n"
"Arena size: %llu\n"
"Objects: %lu\n"
"Entries: %lu\n",
f->path,
sd_id128_to_string(f->header->file_id, a),
sd_id128_to_string(f->header->machine_id, b),
sd_id128_to_string(f->header->boot_id, c),
(unsigned long long) le64toh(f->header->arena_size),
(unsigned long) le64toh(f->header->n_objects),
(unsigned long) le64toh(f->header->n_entries));
p = le64toh(f->header->arena_offset);
while (p != 0) {
r = journal_file_move_to_object(f, -1, p, &o);
if (r < 0)
goto fail;
switch (o->object.type) {
case OBJECT_UNUSED:
printf("Type: OBJECT_UNUSED\n");
break;
case OBJECT_DATA:
printf("Type: OBJECT_DATA\n");
break;
case OBJECT_ENTRY:
printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
(unsigned long long) le64toh(o->entry.seqnum),
(unsigned long long) le64toh(o->entry.monotonic),
(unsigned long long) le64toh(o->entry.realtime));
break;
case OBJECT_FIELD_HASH_TABLE:
printf("Type: OBJECT_FIELD_HASH_TABLE\n");
break;
case OBJECT_DATA_HASH_TABLE:
printf("Type: OBJECT_DATA_HASH_TABLE\n");
break;
case OBJECT_ENTRY_ARRAY:
printf("Type: OBJECT_ENTRY_ARRAY\n");
break;
}
if (o->object.flags & OBJECT_COMPRESSED)
printf("Flags: COMPRESSED\n");
if (p == le64toh(f->header->tail_object_offset))
p = 0;
else
p = p + ALIGN64(le64toh(o->object.size));
}
return;
fail:
log_error("File corrupt");
}
int journal_file_open(
const char *fname,
int flags,
mode_t mode,
JournalFile *template,
JournalFile **ret) {
JournalFile *f;
int r;
bool newly_created = false;
assert(fname);
if ((flags & O_ACCMODE) != O_RDONLY &&
(flags & O_ACCMODE) != O_RDWR)
return -EINVAL;
if (!endswith(fname, ".journal"))
return -EINVAL;
f = new0(JournalFile, 1);
if (!f)
return -ENOMEM;
f->fd = -1;
f->flags = flags;
f->mode = mode;
f->writable = (flags & O_ACCMODE) != O_RDONLY;
f->prot = prot_from_flags(flags);
if (template) {
f->metrics = template->metrics;
f->compress = template->compress;
}
f->path = strdup(fname);
if (!f->path) {
r = -ENOMEM;
goto fail;
}
f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
if (f->fd < 0) {
r = -errno;
goto fail;
}
if (fstat(f->fd, &f->last_stat) < 0) {
r = -errno;
goto fail;
}
if (f->last_stat.st_size == 0 && f->writable) {
newly_created = true;
r = journal_file_init_header(f, template);
if (r < 0)
goto fail;
if (fstat(f->fd, &f->last_stat) < 0) {
r = -errno;
goto fail;
}
}
if (f->last_stat.st_size < (off_t) sizeof(Header)) {
r = -EIO;
goto fail;
}
f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
if (f->header == MAP_FAILED) {
f->header = NULL;
r = -errno;
goto fail;
}
if (!newly_created) {
r = journal_file_verify_header(f);
if (r < 0)
goto fail;
}
if (f->writable) {
r = journal_file_refresh_header(f);
if (r < 0)
goto fail;
}
if (newly_created) {
r = journal_file_setup_field_hash_table(f);
if (r < 0)
goto fail;
r = journal_file_setup_data_hash_table(f);
if (r < 0)
goto fail;
}
r = journal_file_map_field_hash_table(f);
if (r < 0)
goto fail;
r = journal_file_map_data_hash_table(f);
if (r < 0)
goto fail;
if (ret)
*ret = f;
return 0;
fail:
journal_file_close(f);
return r;
}
int journal_file_rotate(JournalFile **f) {
char *p;
size_t l;
JournalFile *old_file, *new_file = NULL;
int r;
assert(f);
assert(*f);
old_file = *f;
if (!old_file->writable)
return -EINVAL;
if (!endswith(old_file->path, ".journal"))
return -EINVAL;
l = strlen(old_file->path);
p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
if (!p)
return -ENOMEM;
memcpy(p, old_file->path, l - 8);
p[l-8] = '@';
sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
"-%016llx-%016llx.journal",
(unsigned long long) le64toh((*f)->header->seqnum),
(unsigned long long) le64toh((*f)->header->tail_entry_realtime));
r = rename(old_file->path, p);
free(p);
if (r < 0)
return -errno;
old_file->header->state = STATE_ARCHIVED;
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
journal_file_close(old_file);
*f = new_file;
return r;
}
int journal_file_open_reliably(
const char *fname,
int flags,
mode_t mode,
JournalFile *template,
JournalFile **ret) {
int r;
size_t l;
char *p;
r = journal_file_open(fname, flags, mode, template, ret);
if (r != -EBADMSG)
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
return r;
if (!(flags & O_CREAT))
return r;
/* The file is corrupted. Rotate it away and try it again (but only once) */
l = strlen(fname);
if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
(int) (l-8), fname,
(unsigned long long) now(CLOCK_REALTIME),
random_ull()) < 0)
return -ENOMEM;
r = rename(fname, p);
free(p);
if (r < 0)
return -errno;
log_warning("File %s corrupted, renaming and replacing.", fname);
return journal_file_open(fname, flags, mode, template, ret);
}
struct vacuum_info {
off_t usage;
char *filename;
uint64_t realtime;
sd_id128_t seqnum_id;
uint64_t seqnum;
bool have_seqnum;
};
static int vacuum_compare(const void *_a, const void *_b) {
const struct vacuum_info *a, *b;
a = _a;
b = _b;
if (a->have_seqnum && b->have_seqnum &&
sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
if (a->seqnum < b->seqnum)
return -1;
else if (a->seqnum > b->seqnum)
return 1;
else
return 0;
}
if (a->realtime < b->realtime)
return -1;
else if (a->realtime > b->realtime)
return 1;
else if (a->have_seqnum && b->have_seqnum)
return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
else
return strcmp(a->filename, b->filename);
}
int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
DIR *d;
int r = 0;
struct vacuum_info *list = NULL;
unsigned n_list = 0, n_allocated = 0, i;
uint64_t sum = 0;
assert(directory);
if (max_use <= 0)
return 0;
d = opendir(directory);
if (!d)
return -errno;
for (;;) {
int k;
struct dirent buf, *de;
size_t q;
struct stat st;
char *p;
unsigned long long seqnum, realtime;
sd_id128_t seqnum_id;
bool have_seqnum;
k = readdir_r(d, &buf, &de);
if (k != 0) {
r = -k;
goto finish;
}
if (!de)
break;
if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
continue;
if (!S_ISREG(st.st_mode))
continue;
q = strlen(de->d_name);
if (endswith(de->d_name, ".journal")) {
/* Vacuum archived files */
if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
continue;
if (de->d_name[q-8-16-1] != '-' ||
de->d_name[q-8-16-1-16-1] != '-' ||
de->d_name[q-8-16-1-16-1-32-1] != '@')
continue;
p = strdup(de->d_name);
if (!p) {
r = -ENOMEM;
goto finish;
}
de->d_name[q-8-16-1-16-1] = 0;
if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
free(p);
continue;
}
if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
free(p);
continue;
}
have_seqnum = true;
} else if (endswith(de->d_name, ".journal~")) {
unsigned long long tmp;
/* Vacuum corrupted files */
if (q < 1 + 16 + 1 + 16 + 8 + 1)
continue;
if (de->d_name[q-1-8-16-1] != '-' ||
de->d_name[q-1-8-16-1-16-1] != '@')
continue;
p = strdup(de->d_name);
if (!p) {
r = -ENOMEM;
goto finish;
}
if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
free(p);
continue;
}
have_seqnum = false;
} else
continue;
if (n_list >= n_allocated) {
struct vacuum_info *j;
n_allocated = MAX(n_allocated * 2U, 8U);
j = realloc(list, n_allocated * sizeof(struct vacuum_info));
if (!j) {
free(p);
r = -ENOMEM;
goto finish;
}
list = j;
}
list[n_list].filename = p;
list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
list[n_list].seqnum = seqnum;
list[n_list].realtime = realtime;
list[n_list].seqnum_id = seqnum_id;
list[n_list].have_seqnum = have_seqnum;
sum += list[n_list].usage;
n_list ++;
}
qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
for(i = 0; i < n_list; i++) {
struct statvfs ss;
if (fstatvfs(dirfd(d), &ss) < 0) {
r = -errno;
goto finish;
}
if (sum <= max_use &&
(uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
break;
if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
sum -= list[i].usage;
} else if (errno != ENOENT)
log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
}
finish:
for (i = 0; i < n_list; i++)
free(list[i].filename);
free(list);
if (d)
closedir(d);
return r;
}
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
uint64_t i, n;
uint64_t q, xor_hash = 0;
int r;
EntryItem *items;
dual_timestamp ts;
assert(from);
assert(to);
assert(o);
assert(p);
if (!to->writable)
return -EPERM;
ts.monotonic = le64toh(o->entry.monotonic);
ts.realtime = le64toh(o->entry.realtime);
if (to->tail_entry_monotonic_valid &&
ts.monotonic < le64toh(to->header->tail_entry_monotonic))
return -EINVAL;
if (ts.realtime < le64toh(to->header->tail_entry_realtime))
return -EINVAL;
n = journal_file_entry_n_items(o);
items = alloca(sizeof(EntryItem) * n);
for (i = 0; i < n; i++) {
uint64_t le_hash, l, h;
size_t t;
void *data;
Object *u;
q = le64toh(o->entry.items[i].object_offset);
le_hash = o->entry.items[i].hash;
r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
if (r < 0)
return r;
if (le_hash != o->data.hash)
return -EBADMSG;
l = le64toh(o->object.size) - offsetof(Object, data.payload);
t = (size_t) l;
/* We hit the limit on 32bit machines */
if ((uint64_t) t != l)
return -E2BIG;
if (o->object.flags & OBJECT_COMPRESSED) {
#ifdef HAVE_XZ
uint64_t rsize;
if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
return -EBADMSG;
data = from->compress_buffer;
l = rsize;
#else
return -EPROTONOSUPPORT;
#endif
} else
data = o->data.payload;
r = journal_file_append_data(to, data, l, &u, &h);
if (r < 0)
return r;
xor_hash ^= le64toh(u->data.hash);
items[i].object_offset = htole64(h);
items[i].hash = u->data.hash;
r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
}
return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
}
void journal_default_metrics(JournalMetrics *m, int fd) {
uint64_t fs_size = 0;
struct statvfs ss;
char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
assert(m);
assert(fd >= 0);
if (fstatvfs(fd, &ss) >= 0)
fs_size = ss.f_frsize * ss.f_blocks;
if (m->max_use == (uint64_t) -1) {
if (fs_size > 0) {
m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
if (m->max_use > DEFAULT_MAX_USE_UPPER)
m->max_use = DEFAULT_MAX_USE_UPPER;
if (m->max_use < DEFAULT_MAX_USE_LOWER)
m->max_use = DEFAULT_MAX_USE_LOWER;
} else
m->max_use = DEFAULT_MAX_USE_LOWER;
} else {
m->max_use = PAGE_ALIGN(m->max_use);
if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
m->max_use = JOURNAL_FILE_SIZE_MIN*2;
}
if (m->max_size == (uint64_t) -1) {
m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
m->max_size = DEFAULT_MAX_SIZE_UPPER;
} else
m->max_size = PAGE_ALIGN(m->max_size);
if (m->max_size < JOURNAL_FILE_SIZE_MIN)
m->max_size = JOURNAL_FILE_SIZE_MIN;
if (m->max_size*2 > m->max_use)
m->max_use = m->max_size*2;
if (m->min_size == (uint64_t) -1)
m->min_size = JOURNAL_FILE_SIZE_MIN;
else {
m->min_size = PAGE_ALIGN(m->min_size);
if (m->min_size < JOURNAL_FILE_SIZE_MIN)
m->min_size = JOURNAL_FILE_SIZE_MIN;
if (m->min_size > m->max_size)
m->max_size = m->min_size;
}
if (m->keep_free == (uint64_t) -1) {
if (fs_size > 0) {
m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
m->keep_free = DEFAULT_KEEP_FREE_UPPER;
} else
m->keep_free = DEFAULT_KEEP_FREE;
}
log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
format_bytes(a, sizeof(a), m->max_use),
format_bytes(b, sizeof(b), m->max_size),
format_bytes(c, sizeof(c), m->min_size),
format_bytes(d, sizeof(d), m->keep_free));
}