Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <errno.h>
4 : #include <fcntl.h>
5 : #include <linux/fs.h>
6 : #include <pthread.h>
7 : #include <stddef.h>
8 : #include <sys/mman.h>
9 : #include <sys/statvfs.h>
10 : #include <sys/uio.h>
11 : #include <unistd.h>
12 :
13 : #include "sd-event.h"
14 :
15 : #include "alloc-util.h"
16 : #include "btrfs-util.h"
17 : #include "chattr-util.h"
18 : #include "compress.h"
19 : #include "fd-util.h"
20 : #include "format-util.h"
21 : #include "fs-util.h"
22 : #include "journal-authenticate.h"
23 : #include "journal-def.h"
24 : #include "journal-file.h"
25 : #include "lookup3.h"
26 : #include "memory-util.h"
27 : #include "path-util.h"
28 : #include "random-util.h"
29 : #include "set.h"
30 : #include "sort-util.h"
31 : #include "stat-util.h"
32 : #include "string-util.h"
33 : #include "strv.h"
34 : #include "xattr-util.h"
35 :
36 : #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37 : #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
38 :
39 : #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
40 : #define MIN_COMPRESS_THRESHOLD (8ULL)
41 :
42 : /* This is the minimum journal file size */
43 : #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
44 :
45 : /* These are the lower and upper bounds if we deduce the max_use value
46 : * from the file system size */
47 : #define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
48 : #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
49 :
50 : /* Those are the lower and upper bounds for the minimal use limit,
51 : * i.e. how much we'll use even if keep_free suggests otherwise. */
52 : #define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
53 : #define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
54 :
55 : /* This is the upper bound if we deduce max_size from max_use */
56 : #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
57 :
58 : /* This is the upper bound if we deduce the keep_free value from the
59 : * file system size */
60 : #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
61 :
62 : /* This is the keep_free value when we can't determine the system
63 : * size */
64 : #define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
65 :
66 : /* This is the default maximum number of journal files to keep around. */
67 : #define DEFAULT_N_MAX_FILES 100
68 :
69 : /* n_data was the first entry we added after the initial file format design */
70 : #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
71 :
72 : /* How many entries to keep in the entry array chain cache at max */
73 : #define CHAIN_CACHE_MAX 20
74 :
75 : /* How much to increase the journal file size at once each time we allocate something new. */
76 : #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
77 :
78 : /* Reread fstat() of the file for detecting deletions at least this often */
79 : #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80 :
81 : /* The mmap context to use for the header we pick as one above the last defined type */
82 : #define CONTEXT_HEADER _OBJECT_TYPE_MAX
83 :
84 : #ifdef __clang__
85 : # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
86 : #endif
87 :
88 : /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
89 : * As a result we use atomic operations on f->offline_state for inter-thread communications with
90 : * journal_file_set_offline() and journal_file_set_online(). */
91 26 : static void journal_file_set_offline_internal(JournalFile *f) {
92 26 : assert(f);
93 26 : assert(f->fd >= 0);
94 26 : assert(f->header);
95 :
96 : for (;;) {
97 52 : switch (f->offline_state) {
98 0 : case OFFLINE_CANCEL:
99 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
100 0 : continue;
101 0 : return;
102 :
103 0 : case OFFLINE_AGAIN_FROM_SYNCING:
104 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
105 0 : continue;
106 0 : break;
107 :
108 0 : case OFFLINE_AGAIN_FROM_OFFLINING:
109 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
110 0 : continue;
111 0 : break;
112 :
113 26 : case OFFLINE_SYNCING:
114 26 : (void) fsync(f->fd);
115 :
116 26 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
117 0 : continue;
118 :
119 26 : f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
120 26 : (void) fsync(f->fd);
121 26 : break;
122 :
123 26 : case OFFLINE_OFFLINING:
124 26 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
125 0 : continue;
126 : _fallthrough_;
127 : case OFFLINE_DONE:
128 26 : return;
129 :
130 0 : case OFFLINE_JOINED:
131 0 : log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
132 0 : return;
133 : }
134 26 : }
135 : }
136 :
/* pthread entry point for an asynchronous offline: names the thread "journal-offline" and runs the
 * offline state machine on the JournalFile passed as 'arg'. Always returns NULL. */
static void * journal_file_set_offline_thread(void *arg) {
        (void) pthread_setname_np(pthread_self(), "journal-offline");

        journal_file_set_offline_internal(arg);

        return NULL;
}
146 :
147 26 : static int journal_file_set_offline_thread_join(JournalFile *f) {
148 : int r;
149 :
150 26 : assert(f);
151 :
152 26 : if (f->offline_state == OFFLINE_JOINED)
153 26 : return 0;
154 :
155 0 : r = pthread_join(f->offline_thread, NULL);
156 0 : if (r)
157 0 : return -r;
158 :
159 0 : f->offline_state = OFFLINE_JOINED;
160 :
161 0 : if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
162 0 : return -EIO;
163 :
164 0 : return 0;
165 : }
166 :
167 : /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
168 26 : static bool journal_file_set_offline_try_restart(JournalFile *f) {
169 : for (;;) {
170 26 : switch (f->offline_state) {
171 0 : case OFFLINE_AGAIN_FROM_SYNCING:
172 : case OFFLINE_AGAIN_FROM_OFFLINING:
173 0 : return true;
174 :
175 0 : case OFFLINE_CANCEL:
176 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
177 0 : continue;
178 0 : return true;
179 :
180 0 : case OFFLINE_SYNCING:
181 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
182 0 : continue;
183 0 : return true;
184 :
185 0 : case OFFLINE_OFFLINING:
186 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
187 0 : continue;
188 0 : return true;
189 :
190 26 : default:
191 26 : return false;
192 : }
193 : }
194 : }
195 :
196 : /* Sets a journal offline.
197 : *
198 : * If wait is false then an offline is dispatched in a separate thread for a
199 : * subsequent journal_file_set_offline() or journal_file_set_online() of the
200 : * same journal to synchronize with.
201 : *
202 : * If wait is true, then either an existing offline thread will be restarted
203 : * and joined, or if none exists the offline is simply performed in this
204 : * context without involving another thread.
205 : */
206 9843 : int journal_file_set_offline(JournalFile *f, bool wait) {
207 : bool restarted;
208 : int r;
209 :
210 9843 : assert(f);
211 :
212 9843 : if (!f->writable)
213 9817 : return -EPERM;
214 :
215 26 : if (f->fd < 0 || !f->header)
216 0 : return -EINVAL;
217 :
218 : /* An offlining journal is implicitly online and may modify f->header->state,
219 : * we must also join any potentially lingering offline thread when not online. */
220 26 : if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
221 0 : return journal_file_set_offline_thread_join(f);
222 :
223 : /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 26 : restarted = journal_file_set_offline_try_restart(f);
225 26 : if ((restarted && wait) || !restarted) {
226 26 : r = journal_file_set_offline_thread_join(f);
227 26 : if (r < 0)
228 0 : return r;
229 : }
230 :
231 26 : if (restarted)
232 0 : return 0;
233 :
234 : /* Initiate a new offline. */
235 26 : f->offline_state = OFFLINE_SYNCING;
236 :
237 26 : if (wait) /* Without using a thread if waiting. */
238 26 : journal_file_set_offline_internal(f);
239 : else {
240 : sigset_t ss, saved_ss;
241 : int k;
242 :
243 0 : assert_se(sigfillset(&ss) >= 0);
244 : /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
245 : * Asynchronous SIGBUS signals can safely be handled by either thread. */
246 0 : assert_se(sigdelset(&ss, SIGBUS) >= 0);
247 :
248 0 : r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
249 0 : if (r > 0)
250 0 : return -r;
251 :
252 0 : r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
253 :
254 0 : k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
255 0 : if (r > 0) {
256 0 : f->offline_state = OFFLINE_JOINED;
257 0 : return -r;
258 : }
259 0 : if (k > 0)
260 0 : return -k;
261 : }
262 :
263 26 : return 0;
264 : }
265 :
266 51131 : static int journal_file_set_online(JournalFile *f) {
267 51131 : bool wait = true;
268 :
269 51131 : assert(f);
270 :
271 51131 : if (!f->writable)
272 0 : return -EPERM;
273 :
274 51131 : if (f->fd < 0 || !f->header)
275 0 : return -EINVAL;
276 :
277 102262 : while (wait) {
278 51131 : switch (f->offline_state) {
279 51131 : case OFFLINE_JOINED:
280 : /* No offline thread, no need to wait. */
281 51131 : wait = false;
282 51131 : break;
283 :
284 0 : case OFFLINE_SYNCING:
285 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
286 0 : continue;
287 : /* Canceled syncing prior to offlining, no need to wait. */
288 0 : wait = false;
289 0 : break;
290 :
291 0 : case OFFLINE_AGAIN_FROM_SYNCING:
292 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
293 0 : continue;
294 : /* Canceled restart from syncing, no need to wait. */
295 0 : wait = false;
296 0 : break;
297 :
298 0 : case OFFLINE_AGAIN_FROM_OFFLINING:
299 0 : if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
300 0 : continue;
301 : /* Canceled restart from offlining, must wait for offlining to complete however. */
302 : _fallthrough_;
303 : default: {
304 : int r;
305 :
306 0 : r = journal_file_set_offline_thread_join(f);
307 0 : if (r < 0)
308 0 : return r;
309 :
310 0 : wait = false;
311 0 : break;
312 : }
313 : }
314 : }
315 :
316 51131 : if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
317 0 : return -EIO;
318 :
319 51131 : switch (f->header->state) {
320 51105 : case STATE_ONLINE:
321 51105 : return 0;
322 :
323 26 : case STATE_OFFLINE:
324 26 : f->header->state = STATE_ONLINE;
325 26 : (void) fsync(f->fd);
326 26 : return 0;
327 :
328 0 : default:
329 0 : return -EINVAL;
330 : }
331 : }
332 :
333 26 : bool journal_file_is_offlining(JournalFile *f) {
334 26 : assert(f);
335 :
336 26 : __sync_synchronize();
337 :
338 26 : if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
339 26 : return false;
340 :
341 0 : return true;
342 : }
343 :
344 9843 : JournalFile* journal_file_close(JournalFile *f) {
345 9843 : if (!f)
346 0 : return NULL;
347 :
348 : #if HAVE_GCRYPT
349 : /* Write the final tag */
350 9843 : if (f->seal && f->writable) {
351 : int r;
352 :
353 0 : r = journal_file_append_tag(f);
354 0 : if (r < 0)
355 0 : log_error_errno(r, "Failed to append tag when closing journal: %m");
356 : }
357 : #endif
358 :
359 9843 : if (f->post_change_timer) {
360 0 : if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
361 0 : journal_file_post_change(f);
362 :
363 0 : sd_event_source_disable_unref(f->post_change_timer);
364 : }
365 :
366 9843 : journal_file_set_offline(f, true);
367 :
368 9843 : if (f->mmap && f->cache_fd)
369 9843 : mmap_cache_free_fd(f->mmap, f->cache_fd);
370 :
371 9843 : if (f->fd >= 0 && f->defrag_on_close) {
372 :
373 : /* Be friendly to btrfs: turn COW back on again now,
374 : * and defragment the file. We won't write to the file
375 : * ever again, hence remove all fragmentation, and
376 : * reenable all the good bits COW usually provides
377 : * (such as data checksumming). */
378 :
379 2 : (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
380 2 : (void) btrfs_defrag_fd(f->fd);
381 : }
382 :
383 9843 : if (f->close_fd)
384 9843 : safe_close(f->fd);
385 9843 : free(f->path);
386 :
387 9843 : mmap_cache_unref(f->mmap);
388 :
389 9843 : ordered_hashmap_free_free(f->chain_cache);
390 :
391 : #if HAVE_XZ || HAVE_LZ4
392 9843 : free(f->compress_buffer);
393 : #endif
394 :
395 : #if HAVE_GCRYPT
396 9843 : if (f->fss_file)
397 0 : munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
398 : else
399 9843 : free(f->fsprg_state);
400 :
401 9843 : free(f->fsprg_seed);
402 :
403 9843 : if (f->hmac)
404 0 : gcry_md_close(f->hmac);
405 : #endif
406 :
407 9843 : return mfree(f);
408 : }
409 :
410 25 : static int journal_file_init_header(JournalFile *f, JournalFile *template) {
411 25 : Header h = {};
412 : ssize_t k;
413 : int r;
414 :
415 25 : assert(f);
416 :
417 25 : memcpy(h.signature, HEADER_SIGNATURE, 8);
418 25 : h.header_size = htole64(ALIGN64(sizeof(h)));
419 :
420 50 : h.incompatible_flags |= htole32(
421 50 : f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
422 25 : f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
423 :
424 50 : h.compatible_flags = htole32(
425 25 : f->seal * HEADER_COMPATIBLE_SEALED);
426 :
427 25 : r = sd_id128_randomize(&h.file_id);
428 25 : if (r < 0)
429 0 : return r;
430 :
431 25 : if (template) {
432 3 : h.seqnum_id = template->header->seqnum_id;
433 3 : h.tail_entry_seqnum = template->header->tail_entry_seqnum;
434 : } else
435 22 : h.seqnum_id = h.file_id;
436 :
437 25 : k = pwrite(f->fd, &h, sizeof(h), 0);
438 25 : if (k < 0)
439 0 : return -errno;
440 :
441 25 : if (k != sizeof(h))
442 0 : return -EIO;
443 :
444 25 : return 0;
445 : }
446 :
447 26 : static int journal_file_refresh_header(JournalFile *f) {
448 : sd_id128_t boot_id;
449 : int r;
450 :
451 26 : assert(f);
452 26 : assert(f->header);
453 :
454 26 : r = sd_id128_get_machine(&f->header->machine_id);
455 26 : if (IN_SET(r, -ENOENT, -ENOMEDIUM))
456 : /* We don't have a machine-id, let's continue without */
457 0 : zero(f->header->machine_id);
458 26 : else if (r < 0)
459 0 : return r;
460 :
461 26 : r = sd_id128_get_boot(&boot_id);
462 26 : if (r < 0)
463 0 : return r;
464 :
465 26 : f->header->boot_id = boot_id;
466 :
467 26 : r = journal_file_set_online(f);
468 :
469 : /* Sync the online state to disk */
470 26 : (void) fsync(f->fd);
471 :
472 : /* We likely just created a new file, also sync the directory this file is located in. */
473 26 : (void) fsync_directory_of_file(f->fd);
474 :
475 26 : return r;
476 : }
477 :
478 9819 : static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
479 9819 : const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
480 9819 : supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
481 9819 : const char *type = compatible ? "compatible" : "incompatible";
482 : uint32_t flags;
483 :
484 9819 : flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
485 :
486 9819 : if (flags & ~supported) {
487 0 : if (flags & ~any)
488 0 : log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
489 : f->path, type, flags & ~any);
490 0 : flags = (flags & any) & ~supported;
491 0 : if (flags) {
492 : const char* strv[3];
493 0 : unsigned n = 0;
494 0 : _cleanup_free_ char *t = NULL;
495 :
496 0 : if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
497 0 : strv[n++] = "sealed";
498 0 : if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
499 0 : strv[n++] = "xz-compressed";
500 0 : if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
501 0 : strv[n++] = "lz4-compressed";
502 0 : strv[n] = NULL;
503 0 : assert(n < ELEMENTSOF(strv));
504 :
505 0 : t = strv_join((char**) strv, ", ");
506 0 : log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
507 : f->path, type, n > 1 ? "flags" : "flag", strnull(t));
508 : }
509 0 : return true;
510 : }
511 :
512 9819 : return false;
513 : }
514 :
515 9818 : static int journal_file_verify_header(JournalFile *f) {
516 : uint64_t arena_size, header_size;
517 :
518 9818 : assert(f);
519 9818 : assert(f->header);
520 :
521 9818 : if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
522 0 : return -EBADMSG;
523 :
524 : /* In both read and write mode we refuse to open files with incompatible
525 : * flags we don't know. */
526 9818 : if (warn_wrong_flags(f, false))
527 0 : return -EPROTONOSUPPORT;
528 :
529 : /* When open for writing we refuse to open files with compatible flags, too. */
530 9818 : if (f->writable && warn_wrong_flags(f, true))
531 0 : return -EPROTONOSUPPORT;
532 :
533 9818 : if (f->header->state >= _STATE_MAX)
534 0 : return -EBADMSG;
535 :
536 9818 : header_size = le64toh(f->header->header_size);
537 :
538 : /* The first addition was n_data, so check that we are at least this large */
539 9818 : if (header_size < HEADER_SIZE_MIN)
540 0 : return -EBADMSG;
541 :
542 9818 : if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
543 0 : return -EBADMSG;
544 :
545 9818 : arena_size = le64toh(f->header->arena_size);
546 :
547 9818 : if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
548 0 : return -ENODATA;
549 :
550 9818 : if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
551 0 : return -ENODATA;
552 :
553 9818 : if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
554 9818 : !VALID64(le64toh(f->header->field_hash_table_offset)) ||
555 9818 : !VALID64(le64toh(f->header->tail_object_offset)) ||
556 9818 : !VALID64(le64toh(f->header->entry_array_offset)))
557 0 : return -ENODATA;
558 :
559 9818 : if (f->writable) {
560 : sd_id128_t machine_id;
561 : uint8_t state;
562 : int r;
563 :
564 1 : r = sd_id128_get_machine(&machine_id);
565 1 : if (r < 0)
566 0 : return r;
567 :
568 1 : if (!sd_id128_equal(machine_id, f->header->machine_id))
569 0 : return -EHOSTDOWN;
570 :
571 1 : state = f->header->state;
572 :
573 1 : if (state == STATE_ARCHIVED)
574 0 : return -ESHUTDOWN; /* Already archived */
575 1 : else if (state == STATE_ONLINE)
576 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
577 : "Journal file %s is already online. Assuming unclean closing.",
578 : f->path);
579 1 : else if (state != STATE_OFFLINE)
580 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
581 : "Journal file %s has unknown state %i.",
582 : f->path, state);
583 :
584 1 : if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
585 0 : return -EBADMSG;
586 :
587 : /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
588 : * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
589 : * bisection. */
590 1 : if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
591 0 : return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
592 : "Journal file %s is from the future, refusing to append new data to it that'd be older.",
593 : f->path);
594 : }
595 :
596 9818 : f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
597 9818 : f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
598 :
599 9818 : f->seal = JOURNAL_HEADER_SEALED(f->header);
600 :
601 9818 : return 0;
602 : }
603 :
604 9894 : static int journal_file_fstat(JournalFile *f) {
605 : int r;
606 :
607 9894 : assert(f);
608 9894 : assert(f->fd >= 0);
609 :
610 9894 : if (fstat(f->fd, &f->last_stat) < 0)
611 0 : return -errno;
612 :
613 9894 : f->last_stat_usec = now(CLOCK_MONOTONIC);
614 :
615 : /* Refuse dealing with with files that aren't regular */
616 9894 : r = stat_verify_regular(&f->last_stat);
617 9894 : if (r < 0)
618 0 : return r;
619 :
620 : /* Refuse appending to files that are already deleted */
621 9894 : if (f->last_stat.st_nlink <= 0)
622 0 : return -EIDRM;
623 :
624 9894 : return 0;
625 : }
626 :
627 51105 : static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
628 : uint64_t old_size, new_size;
629 : int r;
630 :
631 51105 : assert(f);
632 51105 : assert(f->header);
633 :
634 : /* We assume that this file is not sparse, and we know that
635 : * for sure, since we always call posix_fallocate()
636 : * ourselves */
637 :
638 51105 : if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
639 0 : return -EIO;
640 :
641 51105 : old_size =
642 51105 : le64toh(f->header->header_size) +
643 51105 : le64toh(f->header->arena_size);
644 :
645 51105 : new_size = PAGE_ALIGN(offset + size);
646 51105 : if (new_size < le64toh(f->header->header_size))
647 0 : new_size = le64toh(f->header->header_size);
648 :
649 51105 : if (new_size <= old_size) {
650 :
651 : /* We already pre-allocated enough space, but before
652 : * we write to it, let's check with fstat() if the
653 : * file got deleted, in order make sure we don't throw
654 : * away the data immediately. Don't check fstat() for
655 : * all writes though, but only once ever 10s. */
656 :
657 51079 : if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
658 51079 : return 0;
659 :
660 0 : return journal_file_fstat(f);
661 : }
662 :
663 : /* Allocate more space. */
664 :
665 26 : if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
666 0 : return -E2BIG;
667 :
668 26 : if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
669 : struct statvfs svfs;
670 :
671 0 : if (fstatvfs(f->fd, &svfs) >= 0) {
672 : uint64_t available;
673 :
674 0 : available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
675 :
676 0 : if (new_size - old_size > available)
677 0 : return -E2BIG;
678 : }
679 : }
680 :
681 : /* Increase by larger blocks at once */
682 26 : new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
683 26 : if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 0 : new_size = f->metrics.max_size;
685 :
686 : /* Note that the glibc fallocate() fallback is very
687 : inefficient, hence we try to minimize the allocation area
688 : as we can. */
689 26 : r = posix_fallocate(f->fd, old_size, new_size - old_size);
690 26 : if (r != 0)
691 0 : return -r;
692 :
693 26 : f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
694 :
695 26 : return journal_file_fstat(f);
696 : }
697 :
698 2798797 : static unsigned type_to_context(ObjectType type) {
699 : /* One context for each type, plus one catch-all for the rest */
700 : assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
701 : assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
702 2798797 : return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
703 : }
704 :
705 2798797 : static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
706 : int r;
707 :
708 2798797 : assert(f);
709 2798797 : assert(ret);
710 :
711 2798797 : if (size <= 0)
712 0 : return -EINVAL;
713 :
714 : /* Avoid SIGBUS on invalid accesses */
715 2798797 : if (offset + size > (uint64_t) f->last_stat.st_size) {
716 : /* Hmm, out of range? Let's refresh the fstat() data
717 : * first, before we trust that check. */
718 :
719 0 : r = journal_file_fstat(f);
720 0 : if (r < 0)
721 0 : return r;
722 :
723 0 : if (offset + size > (uint64_t) f->last_stat.st_size)
724 0 : return -EADDRNOTAVAIL;
725 : }
726 :
727 2798797 : return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
728 : }
729 :
730 2747114 : static uint64_t minimum_header_size(Object *o) {
731 :
732 : static const uint64_t table[] = {
733 : [OBJECT_DATA] = sizeof(DataObject),
734 : [OBJECT_FIELD] = sizeof(FieldObject),
735 : [OBJECT_ENTRY] = sizeof(EntryObject),
736 : [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
737 : [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
738 : [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
739 : [OBJECT_TAG] = sizeof(TagObject),
740 : };
741 :
742 2747114 : if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
743 0 : return sizeof(ObjectHeader);
744 :
745 2747114 : return table[o->object.type];
746 : }
747 :
748 : /* Lightweight object checks. We want this to be fast, so that we won't
749 : * slowdown every journal_file_move_to_object() call too much. */
750 2747114 : static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
751 2747114 : assert(f);
752 2747114 : assert(o);
753 :
754 2747114 : switch (o->object.type) {
755 :
756 1041423 : case OBJECT_DATA: {
757 1041423 : if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
758 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
759 : "Bad n_entries: %" PRIu64 ": %" PRIu64,
760 : le64toh(o->data.n_entries),
761 : offset);
762 :
763 1041423 : if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
764 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
765 : "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
766 : offsetof(DataObject, payload),
767 : le64toh(o->object.size),
768 : offset);
769 :
770 1041423 : if (!VALID64(le64toh(o->data.next_hash_offset)) ||
771 1041423 : !VALID64(le64toh(o->data.next_field_offset)) ||
772 1041423 : !VALID64(le64toh(o->data.entry_offset)) ||
773 1041423 : !VALID64(le64toh(o->data.entry_array_offset)))
774 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
775 : "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
776 : le64toh(o->data.next_hash_offset),
777 : le64toh(o->data.next_field_offset),
778 : le64toh(o->data.entry_offset),
779 : le64toh(o->data.entry_array_offset),
780 : offset);
781 :
782 1041423 : break;
783 : }
784 :
785 29529 : case OBJECT_FIELD:
786 29529 : if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
787 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
788 : "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
789 : offsetof(FieldObject, payload),
790 : le64toh(o->object.size),
791 : offset);
792 :
793 29529 : if (!VALID64(le64toh(o->field.next_hash_offset)) ||
794 29529 : !VALID64(le64toh(o->field.head_data_offset)))
795 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
796 : "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
797 : le64toh(o->field.next_hash_offset),
798 : le64toh(o->field.head_data_offset),
799 : offset);
800 29529 : break;
801 :
802 310676 : case OBJECT_ENTRY:
803 310676 : if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
804 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
805 : "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
806 : offsetof(EntryObject, items),
807 : le64toh(o->object.size),
808 : offset);
809 :
810 310676 : if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
811 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
812 : "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
813 : (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
814 : offset);
815 :
816 310676 : if (le64toh(o->entry.seqnum) <= 0)
817 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
818 : "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
819 : le64toh(o->entry.seqnum),
820 : offset);
821 :
822 310676 : if (!VALID_REALTIME(le64toh(o->entry.realtime)))
823 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
824 : "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
825 : le64toh(o->entry.realtime),
826 : offset);
827 :
828 310676 : if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
829 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
830 : "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
831 : le64toh(o->entry.monotonic),
832 : offset);
833 :
834 310676 : break;
835 :
836 78 : case OBJECT_DATA_HASH_TABLE:
837 : case OBJECT_FIELD_HASH_TABLE:
838 78 : if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
839 78 : (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
840 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
841 : "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
842 : o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
843 : le64toh(o->object.size),
844 : offset);
845 :
846 78 : break;
847 :
848 1365408 : case OBJECT_ENTRY_ARRAY:
849 1365408 : if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
850 1365408 : (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
851 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
852 : "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
853 : le64toh(o->object.size),
854 : offset);
855 :
856 1365408 : if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
857 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
858 : "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
859 : le64toh(o->entry_array.next_entry_array_offset),
860 : offset);
861 :
862 1365408 : break;
863 :
864 0 : case OBJECT_TAG:
865 0 : if (le64toh(o->object.size) != sizeof(TagObject))
866 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
867 : "Invalid object tag size: %" PRIu64 ": %" PRIu64,
868 : le64toh(o->object.size),
869 : offset);
870 :
871 0 : if (!VALID_EPOCH(le64toh(o->tag.epoch)))
872 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
873 : "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
874 : le64toh(o->tag.epoch), offset);
875 :
876 0 : break;
877 : }
878 :
879 2747114 : return 0;
880 : }
881 :
882 2747552 : int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
883 : int r;
884 : void *t;
885 : size_t tsize;
886 : Object *o;
887 : uint64_t s;
888 :
889 2747552 : assert(f);
890 2747552 : assert(ret);
891 :
892 : /* Objects may only be located at multiple of 64 bit */
893 2747552 : if (!VALID64(offset))
894 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
895 : "Attempt to move to object at non-64bit boundary: %" PRIu64,
896 : offset);
897 :
898 : /* Object may not be located in the file header */
899 2747552 : if (offset < le64toh(f->header->header_size))
900 3 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
901 : "Attempt to move to object located in file header: %" PRIu64,
902 : offset);
903 :
904 2747549 : r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
905 2747549 : if (r < 0)
906 0 : return r;
907 :
908 2747549 : o = (Object*) t;
909 2747549 : s = le64toh(o->object.size);
910 :
911 2747549 : if (s == 0)
912 435 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
913 : "Attempt to move to uninitialized object: %" PRIu64,
914 : offset);
915 2747114 : if (s < sizeof(ObjectHeader))
916 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
917 : "Attempt to move to overly short object: %" PRIu64,
918 : offset);
919 :
920 2747114 : if (o->object.type <= OBJECT_UNUSED)
921 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
922 : "Attempt to move to object with invalid type: %" PRIu64,
923 : offset);
924 :
925 2747114 : if (s < minimum_header_size(o))
926 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
927 : "Attempt to move to truncated object: %" PRIu64,
928 : offset);
929 :
930 2747114 : if (type > OBJECT_UNUSED && o->object.type != type)
931 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
932 : "Attempt to move to object of unexpected type: %" PRIu64,
933 : offset);
934 :
935 2747114 : if (s > tsize) {
936 4 : r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
937 4 : if (r < 0)
938 0 : return r;
939 :
940 4 : o = (Object*) t;
941 : }
942 :
943 2747114 : r = journal_file_check_object(f, offset, o);
944 2747114 : if (r < 0)
945 0 : return r;
946 :
947 2747114 : *ret = o;
948 2747114 : return 0;
949 : }
950 :
951 16285 : static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
952 : uint64_t r;
953 :
954 16285 : assert(f);
955 16285 : assert(f->header);
956 :
957 16285 : r = le64toh(f->header->tail_entry_seqnum) + 1;
958 :
959 16285 : if (seqnum) {
960 : /* If an external seqnum counter was passed, we update
961 : * both the local and the external one, and set it to
962 : * the maximum of both */
963 :
964 7 : if (*seqnum + 1 > r)
965 1 : r = *seqnum + 1;
966 :
967 7 : *seqnum = r;
968 : }
969 :
970 16285 : f->header->tail_entry_seqnum = htole64(r);
971 :
972 16285 : if (f->header->head_entry_seqnum == 0)
973 19 : f->header->head_entry_seqnum = htole64(r);
974 :
975 16285 : return r;
976 : }
977 :
978 51105 : int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
979 : int r;
980 : uint64_t p;
981 : Object *tail, *o;
982 : void *t;
983 :
984 51105 : assert(f);
985 51105 : assert(f->header);
986 51105 : assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
987 51105 : assert(size >= sizeof(ObjectHeader));
988 51105 : assert(offset);
989 51105 : assert(ret);
990 :
991 51105 : r = journal_file_set_online(f);
992 51105 : if (r < 0)
993 0 : return r;
994 :
995 51105 : p = le64toh(f->header->tail_object_offset);
996 51105 : if (p == 0)
997 25 : p = le64toh(f->header->header_size);
998 : else {
999 51080 : r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1000 51080 : if (r < 0)
1001 0 : return r;
1002 :
1003 51080 : p += ALIGN64(le64toh(tail->object.size));
1004 : }
1005 :
1006 51105 : r = journal_file_allocate(f, p, size);
1007 51105 : if (r < 0)
1008 0 : return r;
1009 :
1010 51105 : r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1011 51105 : if (r < 0)
1012 0 : return r;
1013 :
1014 51105 : o = (Object*) t;
1015 :
1016 51105 : zero(o->object);
1017 51105 : o->object.type = type;
1018 51105 : o->object.size = htole64(size);
1019 :
1020 51105 : f->header->tail_object_offset = htole64(p);
1021 51105 : f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1022 :
1023 51105 : *ret = o;
1024 51105 : *offset = p;
1025 :
1026 51105 : return 0;
1027 : }
1028 :
1029 25 : static int journal_file_setup_data_hash_table(JournalFile *f) {
1030 : uint64_t s, p;
1031 : Object *o;
1032 : int r;
1033 :
1034 25 : assert(f);
1035 25 : assert(f->header);
1036 :
1037 : /* We estimate that we need 1 hash table entry per 768 bytes
1038 : of journal file and we want to make sure we never get
1039 : beyond 75% fill level. Calculate the hash table size for
1040 : the maximum file size based on these metrics. */
1041 :
1042 25 : s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1043 25 : if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1044 25 : s = DEFAULT_DATA_HASH_TABLE_SIZE;
1045 :
1046 25 : log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1047 :
1048 25 : r = journal_file_append_object(f,
1049 : OBJECT_DATA_HASH_TABLE,
1050 : offsetof(Object, hash_table.items) + s,
1051 : &o, &p);
1052 25 : if (r < 0)
1053 0 : return r;
1054 :
1055 25 : memzero(o->hash_table.items, s);
1056 :
1057 25 : f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1058 25 : f->header->data_hash_table_size = htole64(s);
1059 :
1060 25 : return 0;
1061 : }
1062 :
1063 25 : static int journal_file_setup_field_hash_table(JournalFile *f) {
1064 : uint64_t s, p;
1065 : Object *o;
1066 : int r;
1067 :
1068 25 : assert(f);
1069 25 : assert(f->header);
1070 :
1071 : /* We use a fixed size hash table for the fields as this
1072 : * number should grow very slowly only */
1073 :
1074 25 : s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1075 25 : r = journal_file_append_object(f,
1076 : OBJECT_FIELD_HASH_TABLE,
1077 : offsetof(Object, hash_table.items) + s,
1078 : &o, &p);
1079 25 : if (r < 0)
1080 0 : return r;
1081 :
1082 25 : memzero(o->hash_table.items, s);
1083 :
1084 25 : f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1085 25 : f->header->field_hash_table_size = htole64(s);
1086 :
1087 25 : return 0;
1088 : }
1089 :
1090 238922 : int journal_file_map_data_hash_table(JournalFile *f) {
1091 : uint64_t s, p;
1092 : void *t;
1093 : int r;
1094 :
1095 238922 : assert(f);
1096 238922 : assert(f->header);
1097 :
1098 238922 : if (f->data_hash_table)
1099 238799 : return 0;
1100 :
1101 123 : p = le64toh(f->header->data_hash_table_offset);
1102 123 : s = le64toh(f->header->data_hash_table_size);
1103 :
1104 123 : r = journal_file_move_to(f,
1105 : OBJECT_DATA_HASH_TABLE,
1106 : true,
1107 : p, s,
1108 : &t, NULL);
1109 123 : if (r < 0)
1110 0 : return r;
1111 :
1112 123 : f->data_hash_table = t;
1113 123 : return 0;
1114 : }
1115 :
1116 28284 : int journal_file_map_field_hash_table(JournalFile *f) {
1117 : uint64_t s, p;
1118 : void *t;
1119 : int r;
1120 :
1121 28284 : assert(f);
1122 28284 : assert(f->header);
1123 :
1124 28284 : if (f->field_hash_table)
1125 28268 : return 0;
1126 :
1127 16 : p = le64toh(f->header->field_hash_table_offset);
1128 16 : s = le64toh(f->header->field_hash_table_size);
1129 :
1130 16 : r = journal_file_move_to(f,
1131 : OBJECT_FIELD_HASH_TABLE,
1132 : true,
1133 : p, s,
1134 : &t, NULL);
1135 16 : if (r < 0)
1136 0 : return r;
1137 :
1138 16 : f->field_hash_table = t;
1139 16 : return 0;
1140 : }
1141 :
1142 112 : static int journal_file_link_field(
1143 : JournalFile *f,
1144 : Object *o,
1145 : uint64_t offset,
1146 : uint64_t hash) {
1147 :
1148 : uint64_t p, h, m;
1149 : int r;
1150 :
1151 112 : assert(f);
1152 112 : assert(f->header);
1153 112 : assert(f->field_hash_table);
1154 112 : assert(o);
1155 112 : assert(offset > 0);
1156 :
1157 112 : if (o->object.type != OBJECT_FIELD)
1158 0 : return -EINVAL;
1159 :
1160 112 : m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1161 112 : if (m <= 0)
1162 0 : return -EBADMSG;
1163 :
1164 : /* This might alter the window we are looking at */
1165 112 : o->field.next_hash_offset = o->field.head_data_offset = 0;
1166 :
1167 112 : h = hash % m;
1168 112 : p = le64toh(f->field_hash_table[h].tail_hash_offset);
1169 112 : if (p == 0)
1170 96 : f->field_hash_table[h].head_hash_offset = htole64(offset);
1171 : else {
1172 16 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1173 16 : if (r < 0)
1174 0 : return r;
1175 :
1176 16 : o->field.next_hash_offset = htole64(offset);
1177 : }
1178 :
1179 112 : f->field_hash_table[h].tail_hash_offset = htole64(offset);
1180 :
1181 112 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1182 112 : f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1183 :
1184 112 : return 0;
1185 : }
1186 :
1187 28288 : static int journal_file_link_data(
1188 : JournalFile *f,
1189 : Object *o,
1190 : uint64_t offset,
1191 : uint64_t hash) {
1192 :
1193 : uint64_t p, h, m;
1194 : int r;
1195 :
1196 28288 : assert(f);
1197 28288 : assert(f->header);
1198 28288 : assert(f->data_hash_table);
1199 28288 : assert(o);
1200 28288 : assert(offset > 0);
1201 :
1202 28288 : if (o->object.type != OBJECT_DATA)
1203 0 : return -EINVAL;
1204 :
1205 28288 : m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1206 28288 : if (m <= 0)
1207 0 : return -EBADMSG;
1208 :
1209 : /* This might alter the window we are looking at */
1210 28288 : o->data.next_hash_offset = o->data.next_field_offset = 0;
1211 28288 : o->data.entry_offset = o->data.entry_array_offset = 0;
1212 28288 : o->data.n_entries = 0;
1213 :
1214 28288 : h = hash % m;
1215 28288 : p = le64toh(f->data_hash_table[h].tail_hash_offset);
1216 28288 : if (p == 0)
1217 : /* Only entry in the hash table is easy */
1218 2399 : f->data_hash_table[h].head_hash_offset = htole64(offset);
1219 : else {
1220 : /* Move back to the previous data object, to patch in
1221 : * pointer */
1222 :
1223 25889 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1224 25889 : if (r < 0)
1225 0 : return r;
1226 :
1227 25889 : o->data.next_hash_offset = htole64(offset);
1228 : }
1229 :
1230 28288 : f->data_hash_table[h].tail_hash_offset = htole64(offset);
1231 :
1232 28288 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1233 28288 : f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1234 :
1235 28288 : return 0;
1236 : }
1237 :
1238 28284 : int journal_file_find_field_object_with_hash(
1239 : JournalFile *f,
1240 : const void *field, uint64_t size, uint64_t hash,
1241 : Object **ret, uint64_t *offset) {
1242 :
1243 : uint64_t p, osize, h, m;
1244 : int r;
1245 :
1246 28284 : assert(f);
1247 28284 : assert(f->header);
1248 28284 : assert(field && size > 0);
1249 :
1250 : /* If the field hash table is empty, we can't find anything */
1251 28284 : if (le64toh(f->header->field_hash_table_size) <= 0)
1252 0 : return 0;
1253 :
1254 : /* Map the field hash table, if it isn't mapped yet. */
1255 28284 : r = journal_file_map_field_hash_table(f);
1256 28284 : if (r < 0)
1257 0 : return r;
1258 :
1259 28284 : osize = offsetof(Object, field.payload) + size;
1260 :
1261 28284 : m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1262 28284 : if (m <= 0)
1263 0 : return -EBADMSG;
1264 :
1265 28284 : h = hash % m;
1266 28284 : p = le64toh(f->field_hash_table[h].head_hash_offset);
1267 :
1268 29397 : while (p > 0) {
1269 : Object *o;
1270 :
1271 29285 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1272 29285 : if (r < 0)
1273 28172 : return r;
1274 :
1275 57457 : if (le64toh(o->field.hash) == hash &&
1276 28172 : le64toh(o->object.size) == osize &&
1277 28172 : memcmp(o->field.payload, field, size) == 0) {
1278 :
1279 28172 : if (ret)
1280 28172 : *ret = o;
1281 28172 : if (offset)
1282 28169 : *offset = p;
1283 :
1284 28172 : return 1;
1285 : }
1286 :
1287 1113 : p = le64toh(o->field.next_hash_offset);
1288 : }
1289 :
1290 112 : return 0;
1291 : }
1292 :
1293 3 : int journal_file_find_field_object(
1294 : JournalFile *f,
1295 : const void *field, uint64_t size,
1296 : Object **ret, uint64_t *offset) {
1297 :
1298 : uint64_t hash;
1299 :
1300 3 : assert(f);
1301 3 : assert(field && size > 0);
1302 :
1303 3 : hash = hash64(field, size);
1304 :
1305 3 : return journal_file_find_field_object_with_hash(f,
1306 : field, size, hash,
1307 : ret, offset);
1308 : }
1309 :
1310 232921 : int journal_file_find_data_object_with_hash(
1311 : JournalFile *f,
1312 : const void *data, uint64_t size, uint64_t hash,
1313 : Object **ret, uint64_t *offset) {
1314 :
1315 : uint64_t p, osize, h, m;
1316 : int r;
1317 :
1318 232921 : assert(f);
1319 232921 : assert(f->header);
1320 232921 : assert(data || size == 0);
1321 :
1322 : /* If there's no data hash table, then there's no entry. */
1323 232921 : if (le64toh(f->header->data_hash_table_size) <= 0)
1324 0 : return 0;
1325 :
1326 : /* Map the data hash table, if it isn't mapped yet. */
1327 232921 : r = journal_file_map_data_hash_table(f);
1328 232921 : if (r < 0)
1329 0 : return r;
1330 :
1331 232921 : osize = offsetof(Object, data.payload) + size;
1332 :
1333 232921 : m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1334 232921 : if (m <= 0)
1335 0 : return -EBADMSG;
1336 :
1337 232921 : h = hash % m;
1338 232921 : p = le64toh(f->data_hash_table[h].head_hash_offset);
1339 :
1340 521460 : while (p > 0) {
1341 : Object *o;
1342 :
1343 492867 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1344 492867 : if (r < 0)
1345 204328 : return r;
1346 :
1347 492867 : if (le64toh(o->data.hash) != hash)
1348 288539 : goto next;
1349 :
1350 204328 : if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1351 : #if HAVE_XZ || HAVE_LZ4
1352 : uint64_t l;
1353 0 : size_t rsize = 0;
1354 :
1355 0 : l = le64toh(o->object.size);
1356 0 : if (l <= offsetof(Object, data.payload))
1357 0 : return -EBADMSG;
1358 :
1359 0 : l -= offsetof(Object, data.payload);
1360 :
1361 0 : r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1362 0 : o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1363 0 : if (r < 0)
1364 0 : return r;
1365 :
1366 0 : if (rsize == size &&
1367 0 : memcmp(f->compress_buffer, data, size) == 0) {
1368 :
1369 0 : if (ret)
1370 0 : *ret = o;
1371 :
1372 0 : if (offset)
1373 0 : *offset = p;
1374 :
1375 0 : return 1;
1376 : }
1377 : #else
1378 : return -EPROTONOSUPPORT;
1379 : #endif
1380 204328 : } else if (le64toh(o->object.size) == osize &&
1381 204328 : memcmp(o->data.payload, data, size) == 0) {
1382 :
1383 204328 : if (ret)
1384 203725 : *ret = o;
1385 :
1386 204328 : if (offset)
1387 204268 : *offset = p;
1388 :
1389 204328 : return 1;
1390 : }
1391 :
1392 0 : next:
1393 288539 : p = le64toh(o->data.next_hash_offset);
1394 : }
1395 :
1396 28593 : return 0;
1397 : }
1398 :
1399 7 : int journal_file_find_data_object(
1400 : JournalFile *f,
1401 : const void *data, uint64_t size,
1402 : Object **ret, uint64_t *offset) {
1403 :
1404 : uint64_t hash;
1405 :
1406 7 : assert(f);
1407 7 : assert(data || size == 0);
1408 :
1409 7 : hash = hash64(data, size);
1410 :
1411 7 : return journal_file_find_data_object_with_hash(f,
1412 : data, size, hash,
1413 : ret, offset);
1414 : }
1415 :
1416 28281 : static int journal_file_append_field(
1417 : JournalFile *f,
1418 : const void *field, uint64_t size,
1419 : Object **ret, uint64_t *offset) {
1420 :
1421 : uint64_t hash, p;
1422 : uint64_t osize;
1423 : Object *o;
1424 : int r;
1425 :
1426 28281 : assert(f);
1427 28281 : assert(field && size > 0);
1428 :
1429 28281 : hash = hash64(field, size);
1430 :
1431 28281 : r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1432 28281 : if (r < 0)
1433 0 : return r;
1434 28281 : else if (r > 0) {
1435 :
1436 28169 : if (ret)
1437 28169 : *ret = o;
1438 :
1439 28169 : if (offset)
1440 28169 : *offset = p;
1441 :
1442 28169 : return 0;
1443 : }
1444 :
1445 112 : osize = offsetof(Object, field.payload) + size;
1446 112 : r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1447 112 : if (r < 0)
1448 0 : return r;
1449 :
1450 112 : o->field.hash = htole64(hash);
1451 112 : memcpy(o->field.payload, field, size);
1452 :
1453 112 : r = journal_file_link_field(f, o, p, hash);
1454 112 : if (r < 0)
1455 0 : return r;
1456 :
1457 : /* The linking might have altered the window, so let's
1458 : * refresh our pointer */
1459 112 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1460 112 : if (r < 0)
1461 0 : return r;
1462 :
1463 : #if HAVE_GCRYPT
1464 112 : r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1465 112 : if (r < 0)
1466 0 : return r;
1467 : #endif
1468 :
1469 112 : if (ret)
1470 112 : *ret = o;
1471 :
1472 112 : if (offset)
1473 112 : *offset = p;
1474 :
1475 112 : return 0;
1476 : }
1477 :
1478 232013 : static int journal_file_append_data(
1479 : JournalFile *f,
1480 : const void *data, uint64_t size,
1481 : Object **ret, uint64_t *offset) {
1482 :
1483 : uint64_t hash, p;
1484 : uint64_t osize;
1485 : Object *o;
1486 232013 : int r, compression = 0;
1487 : const void *eq;
1488 :
1489 232013 : assert(f);
1490 232013 : assert(data || size == 0);
1491 :
1492 232013 : hash = hash64(data, size);
1493 :
1494 232013 : r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1495 232013 : if (r < 0)
1496 0 : return r;
1497 232013 : if (r > 0) {
1498 :
1499 203725 : if (ret)
1500 203725 : *ret = o;
1501 :
1502 203725 : if (offset)
1503 203725 : *offset = p;
1504 :
1505 203725 : return 0;
1506 : }
1507 :
1508 28288 : osize = offsetof(Object, data.payload) + size;
1509 28288 : r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1510 28288 : if (r < 0)
1511 0 : return r;
1512 :
1513 28288 : o->data.hash = htole64(hash);
1514 :
1515 : #if HAVE_XZ || HAVE_LZ4
1516 28288 : if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1517 4 : size_t rsize = 0;
1518 :
1519 4 : compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1520 :
1521 4 : if (compression >= 0) {
1522 4 : o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1523 4 : o->object.flags |= compression;
1524 :
1525 4 : log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1526 : size, rsize, object_compressed_to_string(compression));
1527 : } else
1528 : /* Compression didn't work, we don't really care why, let's continue without compression */
1529 0 : compression = 0;
1530 : }
1531 : #endif
1532 :
1533 28288 : if (compression == 0)
1534 28284 : memcpy_safe(o->data.payload, data, size);
1535 :
1536 28288 : r = journal_file_link_data(f, o, p, hash);
1537 28288 : if (r < 0)
1538 0 : return r;
1539 :
1540 : #if HAVE_GCRYPT
1541 28288 : r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1542 28288 : if (r < 0)
1543 0 : return r;
1544 : #endif
1545 :
1546 : /* The linking might have altered the window, so let's
1547 : * refresh our pointer */
1548 28288 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1549 28288 : if (r < 0)
1550 0 : return r;
1551 :
1552 28288 : if (!data)
1553 0 : eq = NULL;
1554 : else
1555 28288 : eq = memchr(data, '=', size);
1556 28288 : if (eq && eq > data) {
1557 28281 : Object *fo = NULL;
1558 : uint64_t fp;
1559 :
1560 : /* Create field object ... */
1561 28281 : r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1562 28281 : if (r < 0)
1563 0 : return r;
1564 :
1565 : /* ... and link it in. */
1566 28281 : o->data.next_field_offset = fo->field.head_data_offset;
1567 28281 : fo->field.head_data_offset = le64toh(p);
1568 : }
1569 :
1570 28288 : if (ret)
1571 28288 : *ret = o;
1572 :
1573 28288 : if (offset)
1574 28288 : *offset = p;
1575 :
1576 28288 : return 0;
1577 : }
1578 :
1579 50891 : uint64_t journal_file_entry_n_items(Object *o) {
1580 50891 : assert(o);
1581 :
1582 50891 : if (o->object.type != OBJECT_ENTRY)
1583 0 : return 0;
1584 :
1585 50891 : return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1586 : }
1587 :
1588 1362640 : uint64_t journal_file_entry_array_n_items(Object *o) {
1589 1362640 : assert(o);
1590 :
1591 1362640 : if (o->object.type != OBJECT_ENTRY_ARRAY)
1592 0 : return 0;
1593 :
1594 1362640 : return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1595 : }
1596 :
1597 2382 : uint64_t journal_file_hash_table_n_items(Object *o) {
1598 2382 : assert(o);
1599 :
1600 2382 : if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1601 0 : return 0;
1602 :
1603 2382 : return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1604 : }
1605 :
1606 220010 : static int link_entry_into_array(JournalFile *f,
1607 : le64_t *first,
1608 : le64_t *idx,
1609 : uint64_t p) {
1610 : int r;
1611 220010 : uint64_t n = 0, ap = 0, q, i, a, hidx;
1612 : Object *o;
1613 :
1614 220010 : assert(f);
1615 220010 : assert(f->header);
1616 220010 : assert(first);
1617 220010 : assert(idx);
1618 220010 : assert(p > 0);
1619 :
1620 220010 : a = le64toh(*first);
1621 220010 : i = hidx = le64toh(*idx);
1622 1284082 : while (a > 0) {
1623 :
1624 1277712 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1625 1277712 : if (r < 0)
1626 0 : return r;
1627 :
1628 1277712 : n = journal_file_entry_array_n_items(o);
1629 1277712 : if (i < n) {
1630 213640 : o->entry_array.items[i] = htole64(p);
1631 213640 : *idx = htole64(hidx + 1);
1632 213640 : return 0;
1633 : }
1634 :
1635 1064072 : i -= n;
1636 1064072 : ap = a;
1637 1064072 : a = le64toh(o->entry_array.next_entry_array_offset);
1638 : }
1639 :
1640 6370 : if (hidx > n)
1641 1209 : n = (hidx+1) * 2;
1642 : else
1643 5161 : n = n * 2;
1644 :
1645 6370 : if (n < 4)
1646 3804 : n = 4;
1647 :
1648 6370 : r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1649 : offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1650 : &o, &q);
1651 6370 : if (r < 0)
1652 0 : return r;
1653 :
1654 : #if HAVE_GCRYPT
1655 6370 : r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1656 6370 : if (r < 0)
1657 0 : return r;
1658 : #endif
1659 :
1660 6370 : o->entry_array.items[i] = htole64(p);
1661 :
1662 6370 : if (ap == 0)
1663 3804 : *first = htole64(q);
1664 : else {
1665 2566 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1666 2566 : if (r < 0)
1667 0 : return r;
1668 :
1669 2566 : o->entry_array.next_entry_array_offset = htole64(q);
1670 : }
1671 :
1672 6370 : if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1673 6370 : f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1674 :
1675 6370 : *idx = htole64(hidx + 1);
1676 :
1677 6370 : return 0;
1678 : }
1679 :
1680 232013 : static int link_entry_into_array_plus_one(JournalFile *f,
1681 : le64_t *extra,
1682 : le64_t *first,
1683 : le64_t *idx,
1684 : uint64_t p) {
1685 :
1686 : int r;
1687 :
1688 232013 : assert(f);
1689 232013 : assert(extra);
1690 232013 : assert(first);
1691 232013 : assert(idx);
1692 232013 : assert(p > 0);
1693 :
1694 232013 : if (*idx == 0)
1695 28288 : *extra = htole64(p);
1696 : else {
1697 : le64_t i;
1698 :
1699 203725 : i = htole64(le64toh(*idx) - 1);
1700 203725 : r = link_entry_into_array(f, first, &i, p);
1701 203725 : if (r < 0)
1702 0 : return r;
1703 : }
1704 :
1705 232013 : *idx = htole64(le64toh(*idx) + 1);
1706 232013 : return 0;
1707 : }
1708 :
1709 232013 : static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1710 : uint64_t p;
1711 : int r;
1712 232013 : assert(f);
1713 232013 : assert(o);
1714 232013 : assert(offset > 0);
1715 :
1716 232013 : p = le64toh(o->entry.items[i].object_offset);
1717 232013 : if (p == 0)
1718 0 : return -EINVAL;
1719 :
1720 232013 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1721 232013 : if (r < 0)
1722 0 : return r;
1723 :
1724 696039 : return link_entry_into_array_plus_one(f,
1725 232013 : &o->data.entry_offset,
1726 232013 : &o->data.entry_array_offset,
1727 232013 : &o->data.n_entries,
1728 : offset);
1729 : }
1730 :
1731 16285 : static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1732 : uint64_t n, i;
1733 : int r;
1734 :
1735 16285 : assert(f);
1736 16285 : assert(f->header);
1737 16285 : assert(o);
1738 16285 : assert(offset > 0);
1739 :
1740 16285 : if (o->object.type != OBJECT_ENTRY)
1741 0 : return -EINVAL;
1742 :
1743 16285 : __sync_synchronize();
1744 :
1745 : /* Link up the entry itself */
1746 32570 : r = link_entry_into_array(f,
1747 16285 : &f->header->entry_array_offset,
1748 16285 : &f->header->n_entries,
1749 : offset);
1750 16285 : if (r < 0)
1751 0 : return r;
1752 :
1753 : /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1754 :
1755 16285 : if (f->header->head_entry_realtime == 0)
1756 19 : f->header->head_entry_realtime = o->entry.realtime;
1757 :
1758 16285 : f->header->tail_entry_realtime = o->entry.realtime;
1759 16285 : f->header->tail_entry_monotonic = o->entry.monotonic;
1760 :
1761 : /* Link up the items */
1762 16285 : n = journal_file_entry_n_items(o);
1763 248298 : for (i = 0; i < n; i++) {
1764 232013 : r = journal_file_link_entry_item(f, o, offset, i);
1765 232013 : if (r < 0)
1766 0 : return r;
1767 : }
1768 :
1769 16285 : return 0;
1770 : }
1771 :
1772 16285 : static int journal_file_append_entry_internal(
1773 : JournalFile *f,
1774 : const dual_timestamp *ts,
1775 : const sd_id128_t *boot_id,
1776 : uint64_t xor_hash,
1777 : const EntryItem items[], unsigned n_items,
1778 : uint64_t *seqnum,
1779 : Object **ret, uint64_t *offset) {
1780 : uint64_t np;
1781 : uint64_t osize;
1782 : Object *o;
1783 : int r;
1784 :
1785 16285 : assert(f);
1786 16285 : assert(f->header);
1787 16285 : assert(items || n_items == 0);
1788 16285 : assert(ts);
1789 :
1790 16285 : osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1791 :
1792 16285 : r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1793 16285 : if (r < 0)
1794 0 : return r;
1795 :
1796 16285 : o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1797 16285 : memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1798 16285 : o->entry.realtime = htole64(ts->realtime);
1799 16285 : o->entry.monotonic = htole64(ts->monotonic);
1800 16285 : o->entry.xor_hash = htole64(xor_hash);
1801 16285 : if (boot_id)
1802 10001 : f->header->boot_id = *boot_id;
1803 16285 : o->entry.boot_id = f->header->boot_id;
1804 :
1805 : #if HAVE_GCRYPT
1806 16285 : r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1807 16285 : if (r < 0)
1808 0 : return r;
1809 : #endif
1810 :
1811 16285 : r = journal_file_link_entry(f, o, np);
1812 16285 : if (r < 0)
1813 0 : return r;
1814 :
1815 16285 : if (ret)
1816 0 : *ret = o;
1817 :
1818 16285 : if (offset)
1819 0 : *offset = np;
1820 :
1821 16285 : return 0;
1822 : }
1823 :
1824 6285 : void journal_file_post_change(JournalFile *f) {
1825 6285 : assert(f);
1826 :
1827 6285 : if (f->fd < 0)
1828 0 : return;
1829 :
1830 : /* inotify() does not receive IN_MODIFY events from file
1831 : * accesses done via mmap(). After each access we hence
1832 : * trigger IN_MODIFY by truncating the journal file to its
1833 : * current size which triggers IN_MODIFY. */
1834 :
1835 6285 : __sync_synchronize();
1836 :
1837 6285 : if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1838 0 : log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1839 : }
1840 :
1841 0 : static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1842 0 : assert(userdata);
1843 :
1844 0 : journal_file_post_change(userdata);
1845 :
1846 0 : return 1;
1847 : }
1848 :
1849 0 : static void schedule_post_change(JournalFile *f) {
1850 : uint64_t now;
1851 : int r;
1852 :
1853 0 : assert(f);
1854 0 : assert(f->post_change_timer);
1855 :
1856 0 : r = sd_event_source_get_enabled(f->post_change_timer, NULL);
1857 0 : if (r < 0) {
1858 0 : log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1859 0 : goto fail;
1860 : }
1861 0 : if (r > 0)
1862 0 : return;
1863 :
1864 0 : r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
1865 0 : if (r < 0) {
1866 0 : log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1867 0 : goto fail;
1868 : }
1869 :
1870 0 : r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
1871 0 : if (r < 0) {
1872 0 : log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1873 0 : goto fail;
1874 : }
1875 :
1876 0 : r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
1877 0 : if (r < 0) {
1878 0 : log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1879 0 : goto fail;
1880 : }
1881 :
1882 0 : return;
1883 :
1884 0 : fail:
1885 : /* On failure, let's simply post the change immediately. */
1886 0 : journal_file_post_change(f);
1887 : }
1888 :
1889 : /* Enable coalesced change posting in a timer on the provided sd_event instance */
1890 0 : int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1891 0 : _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1892 : int r;
1893 :
1894 0 : assert(f);
1895 0 : assert_return(!f->post_change_timer, -EINVAL);
1896 0 : assert(e);
1897 0 : assert(t);
1898 :
1899 0 : r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1900 0 : if (r < 0)
1901 0 : return r;
1902 :
1903 0 : r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1904 0 : if (r < 0)
1905 0 : return r;
1906 :
1907 0 : f->post_change_timer = TAKE_PTR(timer);
1908 0 : f->post_change_timer_period = t;
1909 :
1910 0 : return r;
1911 : }
1912 :
1913 260 : static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1914 260 : return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1915 : }
1916 :
1917 6285 : int journal_file_append_entry(
1918 : JournalFile *f,
1919 : const dual_timestamp *ts,
1920 : const sd_id128_t *boot_id,
1921 : const struct iovec iovec[], unsigned n_iovec,
1922 : uint64_t *seqnum,
1923 : Object **ret, uint64_t *offset) {
1924 :
1925 : unsigned i;
1926 : EntryItem *items;
1927 : int r;
1928 6285 : uint64_t xor_hash = 0;
1929 : struct dual_timestamp _ts;
1930 :
1931 6285 : assert(f);
1932 6285 : assert(f->header);
1933 6285 : assert(iovec || n_iovec == 0);
1934 :
1935 6285 : if (ts) {
1936 6285 : if (!VALID_REALTIME(ts->realtime))
1937 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1938 : "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
1939 : ts->realtime);
1940 6285 : if (!VALID_MONOTONIC(ts->monotonic))
1941 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1942 : "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
1943 : ts->monotonic);
1944 : } else {
1945 0 : dual_timestamp_get(&_ts);
1946 0 : ts = &_ts;
1947 : }
1948 :
1949 : #if HAVE_GCRYPT
1950 6285 : r = journal_file_maybe_append_tag(f, ts->realtime);
1951 6285 : if (r < 0)
1952 0 : return r;
1953 : #endif
1954 :
1955 : /* alloca() can't take 0, hence let's allocate at least one */
1956 6285 : items = newa(EntryItem, MAX(1u, n_iovec));
1957 :
1958 12830 : for (i = 0; i < n_iovec; i++) {
1959 : uint64_t p;
1960 : Object *o;
1961 :
1962 6545 : r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1963 6545 : if (r < 0)
1964 0 : return r;
1965 :
1966 6545 : xor_hash ^= le64toh(o->data.hash);
1967 6545 : items[i].object_offset = htole64(p);
1968 6545 : items[i].hash = o->data.hash;
1969 : }
1970 :
1971 : /* Order by the position on disk, in order to improve seek
1972 : * times for rotating media. */
1973 6285 : typesafe_qsort(items, n_iovec, entry_item_cmp);
1974 :
1975 6285 : r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
1976 :
1977 : /* If the memory mapping triggered a SIGBUS then we return an
1978 : * IO error and ignore the error code passed down to us, since
1979 : * it is very likely just an effect of a nullified replacement
1980 : * mapping page */
1981 :
1982 6285 : if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
1983 0 : r = -EIO;
1984 :
1985 6285 : if (f->post_change_timer)
1986 0 : schedule_post_change(f);
1987 : else
1988 6285 : journal_file_post_change(f);
1989 :
1990 6285 : return r;
1991 : }
1992 :
/* One cached position inside an entry array chain; stored in an
 * OrderedHashmap keyed by 'first'. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with (the cache key) */
        uint64_t array;      /* offset of the array that is cached */
        uint64_t begin;      /* value of the first item of the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at most recently, to optimize locality when bisecting */
} ChainCacheItem;
2000 :
2001 21653 : static void chain_cache_put(
2002 : OrderedHashmap *h,
2003 : ChainCacheItem *ci,
2004 : uint64_t first,
2005 : uint64_t array,
2006 : uint64_t begin,
2007 : uint64_t total,
2008 : uint64_t last_index) {
2009 :
2010 21653 : if (!ci) {
2011 : /* If the chain item to cache for this chain is the
2012 : * first one it's not worth caching anything */
2013 354 : if (array == first)
2014 243 : return;
2015 :
2016 111 : if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2017 0 : ci = ordered_hashmap_steal_first(h);
2018 0 : assert(ci);
2019 : } else {
2020 111 : ci = new(ChainCacheItem, 1);
2021 111 : if (!ci)
2022 0 : return;
2023 : }
2024 :
2025 111 : ci->first = first;
2026 :
2027 111 : if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2028 0 : free(ci);
2029 0 : return;
2030 : }
2031 : } else
2032 21299 : assert(ci->first == first);
2033 :
2034 21410 : ci->array = array;
2035 21410 : ci->begin = begin;
2036 21410 : ci->total = total;
2037 21410 : ci->last_index = last_index;
2038 : }
2039 :
2040 10947 : static int generic_array_get(
2041 : JournalFile *f,
2042 : uint64_t first,
2043 : uint64_t i,
2044 : Object **ret, uint64_t *offset) {
2045 :
2046 : Object *o;
2047 10947 : uint64_t p = 0, a, t = 0;
2048 : int r;
2049 : ChainCacheItem *ci;
2050 :
2051 10947 : assert(f);
2052 :
2053 10947 : a = first;
2054 :
2055 : /* Try the chain cache first */
2056 10947 : ci = ordered_hashmap_get(f->chain_cache, &first);
2057 10947 : if (ci && i > ci->total) {
2058 10674 : a = ci->array;
2059 10674 : i -= ci->total;
2060 10674 : t = ci->total;
2061 : }
2062 :
2063 11811 : while (a > 0) {
2064 : uint64_t k;
2065 :
2066 11811 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2067 11811 : if (r < 0)
2068 0 : return r;
2069 :
2070 11811 : k = journal_file_entry_array_n_items(o);
2071 11811 : if (i < k) {
2072 10947 : p = le64toh(o->entry_array.items[i]);
2073 10947 : goto found;
2074 : }
2075 :
2076 864 : i -= k;
2077 864 : t += k;
2078 864 : a = le64toh(o->entry_array.next_entry_array_offset);
2079 : }
2080 :
2081 0 : return 0;
2082 :
2083 10947 : found:
2084 : /* Let's cache this item for the next invocation */
2085 10947 : chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2086 :
2087 10947 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2088 10947 : if (r < 0)
2089 438 : return r;
2090 :
2091 10509 : if (ret)
2092 10404 : *ret = o;
2093 :
2094 10509 : if (offset)
2095 10509 : *offset = p;
2096 :
2097 10509 : return 1;
2098 : }
2099 :
2100 559 : static int generic_array_get_plus_one(
2101 : JournalFile *f,
2102 : uint64_t extra,
2103 : uint64_t first,
2104 : uint64_t i,
2105 : Object **ret, uint64_t *offset) {
2106 :
2107 : Object *o;
2108 :
2109 559 : assert(f);
2110 :
2111 559 : if (i == 0) {
2112 : int r;
2113 :
2114 15 : r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2115 15 : if (r < 0)
2116 0 : return r;
2117 :
2118 15 : if (ret)
2119 3 : *ret = o;
2120 :
2121 15 : if (offset)
2122 15 : *offset = extra;
2123 :
2124 15 : return 1;
2125 : }
2126 :
2127 544 : return generic_array_get(f, first, i-1, ret, offset);
2128 : }
2129 :
/* Verdicts a bisection test callback reports for the entry it probed, relative to the needle. */
enum {
        TEST_FOUND = 0, /* the probed entry matches the needle */
        TEST_LEFT  = 1, /* the probed entry sorts before the needle */
        TEST_RIGHT = 2  /* the probed entry sorts after the needle */
};
2135 :
/* Bisects the entry array chain starting at 'first', looking at no more than 'n' items, for
 * 'needle'.
 *
 * test_object() is invoked with the entry offset stored in an array item plus 'needle', and is
 * expected to return TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a negative value on failure. A
 * TEST_FOUND verdict is folded into TEST_RIGHT for DIRECTION_DOWN and into TEST_LEFT otherwise.
 *
 * Returns 1 on success, in which case 'ret' (entry object), 'offset' (entry offset) and 'idx'
 * (index of the item counted across the whole chain) are filled in if non-NULL; 0 if no suitable
 * item exists; a negative value on failure. */
2136 10720 : static int generic_array_bisect(
2137 : JournalFile *f,
2138 : uint64_t first,
2139 : uint64_t n,
2140 : uint64_t needle,
2141 : int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2142 : direction_t direction,
2143 : Object **ret,
2144 : uint64_t *offset,
2145 : uint64_t *idx) {
2146 :
/* a: offset of the array currently examined; t: number of items in the arrays skipped so far;
 * i: index inside the current array; last_p: last item of the previously examined array;
 * last_index: index cached by an earlier call, (uint64_t) -1 if there is none. */
2147 10720 : uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
/* subtract_one: the result is the item right before index i rather than item i itself. */
2148 10720 : bool subtract_one = false;
2149 10720 : Object *o, *array = NULL;
2150 : int r;
2151 : ChainCacheItem *ci;
2152 :
2153 10720 : assert(f);
2154 10720 : assert(test_object);
2155 :
2156 : /* Start with the first array in the chain */
2157 10720 : a = first;
2158 :
2159 10720 : ci = ordered_hashmap_get(f->chain_cache, &first);
2160 10720 : if (ci && n > ci->total && ci->begin != 0) {
2161 : /* Ah, we have iterated this bisection array chain
2162 : * previously! Let's see if we can skip ahead in the
2163 : * chain, as far as the last time. But we can't jump
2164 : * backwards in the chain, so let's check that
2165 : * first. */
2166 :
2167 10634 : r = test_object(f, ci->begin, needle);
2168 10634 : if (r < 0)
2169 0 : return r;
2170 :
2171 10634 : if (r == TEST_LEFT) {
2172 : /* OK, what we are looking for is right of the
2173 : * begin of this EntryArray, so let's jump
2174 : * straight to previously cached array in the
2175 : * chain */
2176 :
2177 10594 : a = ci->array;
2178 10594 : n -= ci->total;
2179 10594 : t = ci->total;
2180 10594 : last_index = ci->last_index;
2181 : }
2182 : }
2183 :
2184 10811 : while (a > 0) {
2185 : uint64_t left, right, k, lp;
2186 :
2187 10807 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2188 10807 : if (r < 0)
2189 0 : return r;
2190 :
2191 10807 : k = journal_file_entry_array_n_items(array);
2192 10807 : right = MIN(k, n);
2193 10807 : if (right <= 0)
2194 0 : return 0;
2195 :
/* Probe the last item of this array we may look at first: only if it is TEST_RIGHT does the
 * result lie inside this array, otherwise we move on to the next array (or stop). */
2196 10807 : i = right - 1;
2197 10807 : lp = p = le64toh(array->entry_array.items[i]);
2198 10807 : if (p <= 0)
2199 6 : r = -EBADMSG;
2200 : else
2201 10801 : r = test_object(f, p, needle);
2202 10807 : if (r == -EBADMSG) {
2203 6 : log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
/* Exclude the bad item and everything after it, then retry the same array. */
2204 6 : n = i;
2205 6 : continue;
2206 : }
2207 10801 : if (r < 0)
2208 0 : return r;
2209 :
2210 10801 : if (r == TEST_FOUND)
2211 153 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2212 :
2213 10801 : if (r == TEST_RIGHT) {
2214 10602 : left = 0;
2215 10602 : right -= 1;
2216 :
2217 10602 : if (last_index != (uint64_t) -1) {
2218 10475 : assert(last_index <= right);
2219 :
2220 : /* If we cached the last index we
2221 : * looked at, let's try to not to jump
2222 : * too wildly around and see if we can
2223 : * limit the range to look at early to
2224 : * the immediate neighbors of the last
2225 : * index we looked at. */
2226 :
2227 10475 : if (last_index > 0) {
2228 10459 : uint64_t x = last_index - 1;
2229 :
2230 10459 : p = le64toh(array->entry_array.items[x]);
2231 10459 : if (p <= 0)
2232 0 : return -EBADMSG;
2233 :
2234 10459 : r = test_object(f, p, needle);
2235 10459 : if (r < 0)
2236 0 : return r;
2237 :
2238 10459 : if (r == TEST_FOUND)
2239 10 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2240 :
2241 10459 : if (r == TEST_RIGHT)
2242 59 : right = x;
2243 : else
2244 10400 : left = x + 1;
2245 : }
2246 :
2247 10475 : if (last_index < right) {
2248 10384 : uint64_t y = last_index + 1;
2249 :
2250 10384 : p = le64toh(array->entry_array.items[y]);
2251 10384 : if (p <= 0)
2252 0 : return -EBADMSG;
2253 :
2254 10384 : r = test_object(f, p, needle);
2255 10384 : if (r < 0)
2256 0 : return r;
2257 :
2258 10384 : if (r == TEST_FOUND)
2259 1 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2260 :
2261 10384 : if (r == TEST_RIGHT)
2262 10384 : right = y;
2263 : else
2264 0 : left = y + 1;
2265 : }
2266 : }
2267 :
/* Plain bisection of the range [left, right] within the current array. */
2268 : for (;;) {
2269 22092 : if (left == right) {
2270 10602 : if (direction == DIRECTION_UP)
2271 196 : subtract_one = true;
2272 :
2273 10602 : i = left;
2274 10602 : goto found;
2275 : }
2276 :
2277 11490 : assert(left < right);
2278 11490 : i = (left + right) / 2;
2279 :
2280 11490 : p = le64toh(array->entry_array.items[i]);
2281 11490 : if (p <= 0)
2282 0 : r = -EBADMSG;
2283 : else
2284 11490 : r = test_object(f, p, needle);
2285 11490 : if (r == -EBADMSG) {
2286 0 : log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2287 0 : right = n = i;
2288 0 : continue;
2289 : }
2290 11490 : if (r < 0)
2291 0 : return r;
2292 :
2293 11490 : if (r == TEST_FOUND)
2294 10367 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2295 :
2296 11490 : if (r == TEST_RIGHT)
2297 10620 : right = i;
2298 : else
2299 870 : left = i + 1;
2300 : }
2301 : }
2302 :
/* The probed item was not TEST_RIGHT. If this array already covers all n items we may look
 * at, nothing lies further down the chain: for DIRECTION_UP the result is then the last
 * item considered (index n - 1, expressed as i = n with subtract_one), otherwise there is
 * no result. */
2303 199 : if (k >= n) {
2304 114 : if (direction == DIRECTION_UP) {
2305 107 : i = n;
2306 107 : subtract_one = true;
2307 107 : goto found;
2308 : }
2309 :
2310 7 : return 0;
2311 : }
2312 :
/* Remember the last item of this array, in case the result turns out to be the item right
 * before index 0 of the next array. */
2313 85 : last_p = lp;
2314 :
2315 85 : n -= k;
2316 85 : t += k;
2317 85 : last_index = (uint64_t) -1;
2318 85 : a = le64toh(array->entry_array.next_entry_array_offset);
2319 : }
2320 :
2321 4 : return 0;
2322 :
2323 10709 : found:
/* Stepping back one item from index 0 with no items counted before this array (t == 0):
 * there is no earlier item that could be returned. */
2324 10709 : if (subtract_one && t == 0 && i == 0)
2325 3 : return 0;
2326 :
2327 : /* Let's cache this item for the next invocation */
2328 10706 : chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2329 :
/* Stepping back from index 0 of a later array yields the last item of the previous array,
 * which was saved in last_p. */
2330 10706 : if (subtract_one && i == 0)
2331 5 : p = last_p;
2332 10701 : else if (subtract_one)
2333 295 : p = le64toh(array->entry_array.items[i-1]);
2334 : else
2335 10406 : p = le64toh(array->entry_array.items[i]);
2336 :
2337 10706 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2338 10706 : if (r < 0)
2339 0 : return r;
2340 :
2341 10706 : if (ret)
2342 11 : *ret = o;
2343 :
2344 10706 : if (offset)
2345 397 : *offset = p;
2346 :
/* Index across the chain: items in the arrays before this one (t) plus the position inside
 * this array, moved back by one if subtract_one is set. */
2347 10706 : if (idx)
2348 10306 : *idx = t + i + (subtract_one ? -1 : 0);
2349 :
2350 10706 : return 1;
2351 : }
2352 :
2353 424 : static int generic_array_bisect_plus_one(
2354 : JournalFile *f,
2355 : uint64_t extra,
2356 : uint64_t first,
2357 : uint64_t n,
2358 : uint64_t needle,
2359 : int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2360 : direction_t direction,
2361 : Object **ret,
2362 : uint64_t *offset,
2363 : uint64_t *idx) {
2364 :
2365 : int r;
2366 424 : bool step_back = false;
2367 : Object *o;
2368 :
2369 424 : assert(f);
2370 424 : assert(test_object);
2371 :
2372 424 : if (n <= 0)
2373 0 : return 0;
2374 :
2375 : /* This bisects the array in object 'first', but first checks
2376 : * an extra */
2377 424 : r = test_object(f, extra, needle);
2378 424 : if (r < 0)
2379 0 : return r;
2380 :
2381 424 : if (r == TEST_FOUND)
2382 16 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2383 :
2384 : /* if we are looking with DIRECTION_UP then we need to first
2385 : see if in the actual array there is a matching entry, and
2386 : return the last one of that. But if there isn't any we need
2387 : to return this one. Hence remember this, and return it
2388 : below. */
2389 424 : if (r == TEST_LEFT)
2390 402 : step_back = direction == DIRECTION_UP;
2391 :
2392 424 : if (r == TEST_RIGHT) {
2393 22 : if (direction == DIRECTION_DOWN)
2394 19 : goto found;
2395 : else
2396 3 : return 0;
2397 : }
2398 :
2399 402 : r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2400 :
2401 402 : if (r == 0 && step_back)
2402 3 : goto found;
2403 :
2404 399 : if (r > 0 && idx)
2405 0 : (*idx)++;
2406 :
2407 399 : return r;
2408 :
2409 22 : found:
2410 22 : r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2411 22 : if (r < 0)
2412 0 : return r;
2413 :
2414 22 : if (ret)
2415 0 : *ret = o;
2416 :
2417 22 : if (offset)
2418 22 : *offset = extra;
2419 :
2420 22 : if (idx)
2421 0 : *idx = 0;
2422 :
2423 22 : return 1;
2424 : }
2425 :
2426 54171 : _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2427 54171 : assert(f);
2428 54171 : assert(p > 0);
2429 :
2430 54171 : if (p == needle)
2431 10561 : return TEST_FOUND;
2432 43610 : else if (p < needle)
2433 22245 : return TEST_LEFT;
2434 : else
2435 21365 : return TEST_RIGHT;
2436 : }
2437 :
2438 15 : static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2439 : Object *o;
2440 : int r;
2441 :
2442 15 : assert(f);
2443 15 : assert(p > 0);
2444 :
2445 15 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2446 15 : if (r < 0)
2447 0 : return r;
2448 :
2449 15 : if (le64toh(o->entry.seqnum) == needle)
2450 7 : return TEST_FOUND;
2451 8 : else if (le64toh(o->entry.seqnum) < needle)
2452 3 : return TEST_LEFT;
2453 : else
2454 5 : return TEST_RIGHT;
2455 : }
2456 :
2457 8 : int journal_file_move_to_entry_by_seqnum(
2458 : JournalFile *f,
2459 : uint64_t seqnum,
2460 : direction_t direction,
2461 : Object **ret,
2462 : uint64_t *offset) {
2463 8 : assert(f);
2464 8 : assert(f->header);
2465 :
2466 8 : return generic_array_bisect(f,
2467 8 : le64toh(f->header->entry_array_offset),
2468 8 : le64toh(f->header->n_entries),
2469 : seqnum,
2470 : test_object_seqnum,
2471 : direction,
2472 : ret, offset, NULL);
2473 : }
2474 :
2475 6 : static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2476 : Object *o;
2477 : int r;
2478 :
2479 6 : assert(f);
2480 6 : assert(p > 0);
2481 :
2482 6 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2483 6 : if (r < 0)
2484 0 : return r;
2485 :
2486 6 : if (le64toh(o->entry.realtime) == needle)
2487 0 : return TEST_FOUND;
2488 6 : else if (le64toh(o->entry.realtime) < needle)
2489 2 : return TEST_LEFT;
2490 : else
2491 4 : return TEST_RIGHT;
2492 : }
2493 :
2494 4 : int journal_file_move_to_entry_by_realtime(
2495 : JournalFile *f,
2496 : uint64_t realtime,
2497 : direction_t direction,
2498 : Object **ret,
2499 : uint64_t *offset) {
2500 4 : assert(f);
2501 4 : assert(f->header);
2502 :
2503 4 : return generic_array_bisect(f,
2504 4 : le64toh(f->header->entry_array_offset),
2505 4 : le64toh(f->header->n_entries),
2506 : realtime,
2507 : test_object_realtime,
2508 : direction,
2509 : ret, offset, NULL);
2510 : }
2511 :
2512 0 : static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2513 : Object *o;
2514 : int r;
2515 :
2516 0 : assert(f);
2517 0 : assert(p > 0);
2518 :
2519 0 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2520 0 : if (r < 0)
2521 0 : return r;
2522 :
2523 0 : if (le64toh(o->entry.monotonic) == needle)
2524 0 : return TEST_FOUND;
2525 0 : else if (le64toh(o->entry.monotonic) < needle)
2526 0 : return TEST_LEFT;
2527 : else
2528 0 : return TEST_RIGHT;
2529 : }
2530 :
2531 4 : static int find_data_object_by_boot_id(
2532 : JournalFile *f,
2533 : sd_id128_t boot_id,
2534 : Object **o,
2535 : uint64_t *b) {
2536 :
2537 4 : char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2538 :
2539 4 : sd_id128_to_string(boot_id, t + 9);
2540 4 : return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2541 : }
2542 :
2543 4 : int journal_file_move_to_entry_by_monotonic(
2544 : JournalFile *f,
2545 : sd_id128_t boot_id,
2546 : uint64_t monotonic,
2547 : direction_t direction,
2548 : Object **ret,
2549 : uint64_t *offset) {
2550 :
2551 : Object *o;
2552 : int r;
2553 :
2554 4 : assert(f);
2555 :
2556 4 : r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2557 4 : if (r < 0)
2558 0 : return r;
2559 4 : if (r == 0)
2560 4 : return -ENOENT;
2561 :
2562 0 : return generic_array_bisect_plus_one(f,
2563 0 : le64toh(o->data.entry_offset),
2564 0 : le64toh(o->data.entry_array_offset),
2565 0 : le64toh(o->data.n_entries),
2566 : monotonic,
2567 : test_object_monotonic,
2568 : direction,
2569 : ret, offset, NULL);
2570 : }
2571 :
2572 11974 : void journal_file_reset_location(JournalFile *f) {
2573 11974 : f->location_type = LOCATION_HEAD;
2574 11974 : f->current_offset = 0;
2575 11974 : f->current_seqnum = 0;
2576 11974 : f->current_realtime = 0;
2577 11974 : f->current_monotonic = 0;
2578 11974 : zero(f->current_boot_id);
2579 11974 : f->current_xor_hash = 0;
2580 11974 : }
2581 :
2582 10611 : void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2583 10611 : f->location_type = LOCATION_SEEK;
2584 10611 : f->current_offset = offset;
2585 10611 : f->current_seqnum = le64toh(o->entry.seqnum);
2586 10611 : f->current_realtime = le64toh(o->entry.realtime);
2587 10611 : f->current_monotonic = le64toh(o->entry.monotonic);
2588 10611 : f->current_boot_id = o->entry.boot_id;
2589 10611 : f->current_xor_hash = le64toh(o->entry.xor_hash);
2590 10611 : }
2591 :
2592 991152 : int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2593 : int r;
2594 :
2595 991152 : assert(af);
2596 991152 : assert(af->header);
2597 991152 : assert(bf);
2598 991152 : assert(bf->header);
2599 991152 : assert(af->location_type == LOCATION_SEEK);
2600 991152 : assert(bf->location_type == LOCATION_SEEK);
2601 :
2602 : /* If contents and timestamps match, these entries are
2603 : * identical, even if the seqnum does not match */
2604 991152 : if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2605 20682 : af->current_monotonic == bf->current_monotonic &&
2606 82 : af->current_realtime == bf->current_realtime &&
2607 82 : af->current_xor_hash == bf->current_xor_hash)
2608 82 : return 0;
2609 :
2610 991070 : if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2611 :
2612 : /* If this is from the same seqnum source, compare
2613 : * seqnums */
2614 9697 : r = CMP(af->current_seqnum, bf->current_seqnum);
2615 9697 : if (r != 0)
2616 9697 : return r;
2617 :
2618 : /* Wow! This is weird, different data but the same
2619 : * seqnums? Something is borked, but let's make the
2620 : * best of it and compare by time. */
2621 : }
2622 :
2623 981373 : if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2624 :
2625 : /* If the boot id matches, compare monotonic time */
2626 20570 : r = CMP(af->current_monotonic, bf->current_monotonic);
2627 20570 : if (r != 0)
2628 20570 : return r;
2629 : }
2630 :
2631 : /* Otherwise, compare UTC time */
2632 960803 : r = CMP(af->current_realtime, bf->current_realtime);
2633 960803 : if (r != 0)
2634 960803 : return r;
2635 :
2636 : /* Finally, compare by contents */
2637 0 : return CMP(af->current_xor_hash, bf->current_xor_hash);
2638 : }
2639 :
2640 10744 : static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2641 :
2642 : /* Increase or decrease the specified index, in the right direction. */
2643 :
2644 10744 : if (direction == DIRECTION_DOWN) {
2645 10284 : if (*i >= n - 1)
2646 14 : return 0;
2647 :
2648 10270 : (*i) ++;
2649 : } else {
2650 460 : if (*i <= 0)
2651 10 : return 0;
2652 :
2653 450 : (*i) --;
2654 : }
2655 :
2656 10720 : return 1;
2657 : }
2658 :
2659 10282 : static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2660 :
2661 : /* Consider it an error if any of the two offsets is uninitialized */
2662 10282 : if (old_offset == 0 || new_offset == 0)
2663 0 : return false;
2664 :
2665 : /* If we go down, the new offset must be larger than the old one. */
2666 : return direction == DIRECTION_DOWN ?
2667 10282 : new_offset > old_offset :
2668 : new_offset < old_offset;
2669 : }
2670 :
2671 10428 : int journal_file_next_entry(
2672 : JournalFile *f,
2673 : uint64_t p,
2674 : direction_t direction,
2675 : Object **ret, uint64_t *offset) {
2676 :
2677 : uint64_t i, n, ofs;
2678 : int r;
2679 :
2680 10428 : assert(f);
2681 10428 : assert(f->header);
2682 :
2683 10428 : n = le64toh(f->header->n_entries);
2684 10428 : if (n <= 0)
2685 1 : return 0;
2686 :
2687 10427 : if (p == 0)
2688 121 : i = direction == DIRECTION_DOWN ? 0 : n - 1;
2689 : else {
2690 10306 : r = generic_array_bisect(f,
2691 10306 : le64toh(f->header->entry_array_offset),
2692 10306 : le64toh(f->header->n_entries),
2693 : p,
2694 : test_object_offset,
2695 : DIRECTION_DOWN,
2696 : NULL, NULL,
2697 : &i);
2698 10306 : if (r <= 0)
2699 0 : return r;
2700 :
2701 10306 : r = bump_array_index(&i, direction, n);
2702 10306 : if (r <= 0)
2703 24 : return r;
2704 : }
2705 :
2706 : /* And jump to it */
2707 : for (;;) {
2708 10403 : r = generic_array_get(f,
2709 10403 : le64toh(f->header->entry_array_offset),
2710 : i,
2711 : ret, &ofs);
2712 10403 : if (r > 0)
2713 10403 : break;
2714 0 : if (r != -EBADMSG)
2715 0 : return r;
2716 :
2717 : /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2718 : * the next one might work for us instead. */
2719 0 : log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2720 :
2721 0 : r = bump_array_index(&i, direction, n);
2722 0 : if (r <= 0)
2723 0 : return r;
2724 : }
2725 :
2726 : /* Ensure our array is properly ordered. */
2727 10403 : if (p > 0 && !check_properly_ordered(ofs, p, direction))
2728 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2729 : "%s: entry array not properly ordered at entry %" PRIu64,
2730 : f->path, i);
2731 :
2732 10403 : if (offset)
2733 10403 : *offset = ofs;
2734 :
2735 10403 : return 1;
2736 : }
2737 :
2738 121 : int journal_file_next_entry_for_data(
2739 : JournalFile *f,
2740 : Object *o, uint64_t p,
2741 : uint64_t data_offset,
2742 : direction_t direction,
2743 : Object **ret, uint64_t *offset) {
2744 :
2745 : uint64_t i, n, ofs;
2746 : Object *d;
2747 : int r;
2748 :
2749 121 : assert(f);
2750 121 : assert(p > 0 || !o);
2751 :
2752 121 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2753 121 : if (r < 0)
2754 0 : return r;
2755 :
2756 121 : n = le64toh(d->data.n_entries);
2757 121 : if (n <= 0)
2758 0 : return n;
2759 :
2760 121 : if (!o)
2761 121 : i = direction == DIRECTION_DOWN ? 0 : n - 1;
2762 : else {
2763 0 : if (o->object.type != OBJECT_ENTRY)
2764 0 : return -EINVAL;
2765 :
2766 0 : r = generic_array_bisect_plus_one(f,
2767 0 : le64toh(d->data.entry_offset),
2768 0 : le64toh(d->data.entry_array_offset),
2769 0 : le64toh(d->data.n_entries),
2770 : p,
2771 : test_object_offset,
2772 : DIRECTION_DOWN,
2773 : NULL, NULL,
2774 : &i);
2775 :
2776 0 : if (r <= 0)
2777 0 : return r;
2778 :
2779 0 : r = bump_array_index(&i, direction, n);
2780 0 : if (r <= 0)
2781 0 : return r;
2782 : }
2783 :
2784 : for (;;) {
2785 997 : r = generic_array_get_plus_one(f,
2786 559 : le64toh(d->data.entry_offset),
2787 559 : le64toh(d->data.entry_array_offset),
2788 : i,
2789 : ret, &ofs);
2790 559 : if (r > 0)
2791 121 : break;
2792 438 : if (r != -EBADMSG)
2793 0 : return r;
2794 :
2795 438 : log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2796 :
2797 438 : r = bump_array_index(&i, direction, n);
2798 438 : if (r <= 0)
2799 0 : return r;
2800 : }
2801 :
2802 : /* Ensure our array is properly ordered. */
2803 121 : if (p > 0 && check_properly_ordered(ofs, p, direction))
2804 0 : return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2805 : "%s data entry array not properly ordered at entry %" PRIu64,
2806 : f->path, i);
2807 :
2808 121 : if (offset)
2809 117 : *offset = ofs;
2810 :
2811 121 : return 1;
2812 : }
2813 :
2814 424 : int journal_file_move_to_entry_by_offset_for_data(
2815 : JournalFile *f,
2816 : uint64_t data_offset,
2817 : uint64_t p,
2818 : direction_t direction,
2819 : Object **ret, uint64_t *offset) {
2820 :
2821 : int r;
2822 : Object *d;
2823 :
2824 424 : assert(f);
2825 :
2826 424 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2827 424 : if (r < 0)
2828 0 : return r;
2829 :
2830 424 : return generic_array_bisect_plus_one(f,
2831 424 : le64toh(d->data.entry_offset),
2832 424 : le64toh(d->data.entry_array_offset),
2833 424 : le64toh(d->data.n_entries),
2834 : p,
2835 : test_object_offset,
2836 : direction,
2837 : ret, offset, NULL);
2838 : }
2839 :
2840 0 : int journal_file_move_to_entry_by_monotonic_for_data(
2841 : JournalFile *f,
2842 : uint64_t data_offset,
2843 : sd_id128_t boot_id,
2844 : uint64_t monotonic,
2845 : direction_t direction,
2846 : Object **ret, uint64_t *offset) {
2847 :
2848 : Object *o, *d;
2849 : int r;
2850 : uint64_t b, z;
2851 :
2852 0 : assert(f);
2853 :
2854 : /* First, seek by time */
2855 0 : r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2856 0 : if (r < 0)
2857 0 : return r;
2858 0 : if (r == 0)
2859 0 : return -ENOENT;
2860 :
2861 0 : r = generic_array_bisect_plus_one(f,
2862 0 : le64toh(o->data.entry_offset),
2863 0 : le64toh(o->data.entry_array_offset),
2864 0 : le64toh(o->data.n_entries),
2865 : monotonic,
2866 : test_object_monotonic,
2867 : direction,
2868 : NULL, &z, NULL);
2869 0 : if (r <= 0)
2870 0 : return r;
2871 :
2872 : /* And now, continue seeking until we find an entry that
2873 : * exists in both bisection arrays */
2874 :
2875 0 : for (;;) {
2876 : Object *qo;
2877 : uint64_t p, q;
2878 :
2879 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2880 0 : if (r < 0)
2881 0 : return r;
2882 :
2883 0 : r = generic_array_bisect_plus_one(f,
2884 0 : le64toh(d->data.entry_offset),
2885 0 : le64toh(d->data.entry_array_offset),
2886 0 : le64toh(d->data.n_entries),
2887 : z,
2888 : test_object_offset,
2889 : direction,
2890 : NULL, &p, NULL);
2891 0 : if (r <= 0)
2892 0 : return r;
2893 :
2894 0 : r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2895 0 : if (r < 0)
2896 0 : return r;
2897 :
2898 0 : r = generic_array_bisect_plus_one(f,
2899 0 : le64toh(o->data.entry_offset),
2900 0 : le64toh(o->data.entry_array_offset),
2901 0 : le64toh(o->data.n_entries),
2902 : p,
2903 : test_object_offset,
2904 : direction,
2905 : &qo, &q, NULL);
2906 :
2907 0 : if (r <= 0)
2908 0 : return r;
2909 :
2910 0 : if (p == q) {
2911 0 : if (ret)
2912 0 : *ret = qo;
2913 0 : if (offset)
2914 0 : *offset = q;
2915 :
2916 0 : return 1;
2917 : }
2918 :
2919 0 : z = q;
2920 : }
2921 : }
2922 :
2923 0 : int journal_file_move_to_entry_by_seqnum_for_data(
2924 : JournalFile *f,
2925 : uint64_t data_offset,
2926 : uint64_t seqnum,
2927 : direction_t direction,
2928 : Object **ret, uint64_t *offset) {
2929 :
2930 : Object *d;
2931 : int r;
2932 :
2933 0 : assert(f);
2934 :
2935 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2936 0 : if (r < 0)
2937 0 : return r;
2938 :
2939 0 : return generic_array_bisect_plus_one(f,
2940 0 : le64toh(d->data.entry_offset),
2941 0 : le64toh(d->data.entry_array_offset),
2942 0 : le64toh(d->data.n_entries),
2943 : seqnum,
2944 : test_object_seqnum,
2945 : direction,
2946 : ret, offset, NULL);
2947 : }
2948 :
2949 0 : int journal_file_move_to_entry_by_realtime_for_data(
2950 : JournalFile *f,
2951 : uint64_t data_offset,
2952 : uint64_t realtime,
2953 : direction_t direction,
2954 : Object **ret, uint64_t *offset) {
2955 :
2956 : Object *d;
2957 : int r;
2958 :
2959 0 : assert(f);
2960 :
2961 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2962 0 : if (r < 0)
2963 0 : return r;
2964 :
2965 0 : return generic_array_bisect_plus_one(f,
2966 0 : le64toh(d->data.entry_offset),
2967 0 : le64toh(d->data.entry_array_offset),
2968 0 : le64toh(d->data.n_entries),
2969 : realtime,
2970 : test_object_realtime,
2971 : direction,
2972 : ret, offset, NULL);
2973 : }
2974 :
2975 9 : void journal_file_dump(JournalFile *f) {
2976 : Object *o;
2977 : int r;
2978 : uint64_t p;
2979 :
2980 9 : assert(f);
2981 9 : assert(f->header);
2982 :
2983 9 : journal_file_print_header(f);
2984 :
2985 9 : p = le64toh(f->header->header_size);
2986 6451 : while (p != 0) {
2987 6442 : r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2988 6442 : if (r < 0)
2989 0 : goto fail;
2990 :
2991 6442 : switch (o->object.type) {
2992 :
2993 0 : case OBJECT_UNUSED:
2994 0 : printf("Type: OBJECT_UNUSED\n");
2995 0 : break;
2996 :
2997 86 : case OBJECT_DATA:
2998 86 : printf("Type: OBJECT_DATA\n");
2999 86 : break;
3000 :
3001 3 : case OBJECT_FIELD:
3002 3 : printf("Type: OBJECT_FIELD\n");
3003 3 : break;
3004 :
3005 6010 : case OBJECT_ENTRY:
3006 6010 : printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3007 6010 : le64toh(o->entry.seqnum),
3008 6010 : le64toh(o->entry.monotonic),
3009 6010 : le64toh(o->entry.realtime));
3010 6010 : break;
3011 :
3012 9 : case OBJECT_FIELD_HASH_TABLE:
3013 9 : printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3014 9 : break;
3015 :
3016 9 : case OBJECT_DATA_HASH_TABLE:
3017 9 : printf("Type: OBJECT_DATA_HASH_TABLE\n");
3018 9 : break;
3019 :
3020 325 : case OBJECT_ENTRY_ARRAY:
3021 325 : printf("Type: OBJECT_ENTRY_ARRAY\n");
3022 325 : break;
3023 :
3024 0 : case OBJECT_TAG:
3025 0 : printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3026 0 : le64toh(o->tag.seqnum),
3027 0 : le64toh(o->tag.epoch));
3028 0 : break;
3029 :
3030 0 : default:
3031 0 : printf("Type: unknown (%i)\n", o->object.type);
3032 0 : break;
3033 : }
3034 :
3035 6442 : if (o->object.flags & OBJECT_COMPRESSION_MASK)
3036 4 : printf("Flags: %s\n",
3037 4 : object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3038 :
3039 6442 : if (p == le64toh(f->header->tail_object_offset))
3040 9 : p = 0;
3041 : else
3042 6433 : p = p + ALIGN64(le64toh(o->object.size));
3043 : }
3044 :
3045 9 : return;
3046 0 : fail:
3047 0 : log_error("File corrupt");
3048 : }
3049 :
3050 26 : static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3051 : const char *x;
3052 :
3053 26 : x = format_timestamp(buf, l, t);
3054 26 : if (x)
3055 18 : return x;
3056 8 : return " --- ";
3057 : }
3058 :
3059 13 : void journal_file_print_header(JournalFile *f) {
3060 : char a[33], b[33], c[33], d[33];
3061 : char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3062 : struct stat st;
3063 : char bytes[FORMAT_BYTES_MAX];
3064 :
3065 13 : assert(f);
3066 13 : assert(f->header);
3067 :
3068 143 : printf("File Path: %s\n"
3069 : "File ID: %s\n"
3070 : "Machine ID: %s\n"
3071 : "Boot ID: %s\n"
3072 : "Sequential Number ID: %s\n"
3073 : "State: %s\n"
3074 : "Compatible Flags:%s%s\n"
3075 : "Incompatible Flags:%s%s%s\n"
3076 : "Header size: %"PRIu64"\n"
3077 : "Arena size: %"PRIu64"\n"
3078 : "Data Hash Table Size: %"PRIu64"\n"
3079 : "Field Hash Table Size: %"PRIu64"\n"
3080 : "Rotate Suggested: %s\n"
3081 : "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3082 : "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3083 : "Head Realtime Timestamp: %s (%"PRIx64")\n"
3084 : "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3085 : "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3086 : "Objects: %"PRIu64"\n"
3087 : "Entry Objects: %"PRIu64"\n",
3088 : f->path,
3089 13 : sd_id128_to_string(f->header->file_id, a),
3090 13 : sd_id128_to_string(f->header->machine_id, b),
3091 13 : sd_id128_to_string(f->header->boot_id, c),
3092 13 : sd_id128_to_string(f->header->seqnum_id, d),
3093 13 : f->header->state == STATE_OFFLINE ? "OFFLINE" :
3094 12 : f->header->state == STATE_ONLINE ? "ONLINE" :
3095 0 : f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3096 13 : JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3097 13 : (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3098 13 : JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3099 13 : JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3100 13 : (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3101 13 : le64toh(f->header->header_size),
3102 13 : le64toh(f->header->arena_size),
3103 13 : le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3104 13 : le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3105 13 : yes_no(journal_file_rotate_suggested(f, 0)),
3106 13 : le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3107 13 : le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3108 13 : format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3109 13 : format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3110 13 : format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3111 13 : le64toh(f->header->n_objects),
3112 13 : le64toh(f->header->n_entries));
3113 :
3114 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3115 13 : printf("Data Objects: %"PRIu64"\n"
3116 : "Data Hash Table Fill: %.1f%%\n",
3117 13 : le64toh(f->header->n_data),
3118 13 : 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3119 :
3120 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3121 13 : printf("Field Objects: %"PRIu64"\n"
3122 : "Field Hash Table Fill: %.1f%%\n",
3123 13 : le64toh(f->header->n_fields),
3124 13 : 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3125 :
3126 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3127 13 : printf("Tag Objects: %"PRIu64"\n",
3128 13 : le64toh(f->header->n_tags));
3129 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3130 13 : printf("Entry Array Objects: %"PRIu64"\n",
3131 13 : le64toh(f->header->n_entry_arrays));
3132 :
3133 13 : if (fstat(f->fd, &st) >= 0)
3134 13 : printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3135 13 : }
3136 :
3137 25 : static int journal_file_warn_btrfs(JournalFile *f) {
3138 : unsigned attrs;
3139 : int r;
3140 :
3141 25 : assert(f);
3142 :
3143 : /* Before we write anything, check if the COW logic is turned
3144 : * off on btrfs. Given our write pattern that is quite
3145 : * unfriendly to COW file systems this should greatly improve
3146 : * performance on COW file systems, such as btrfs, at the
3147 : * expense of data integrity features (which shouldn't be too
3148 : * bad, given that we do our own checksumming). */
3149 :
3150 25 : r = btrfs_is_filesystem(f->fd);
3151 25 : if (r < 0)
3152 0 : return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3153 25 : if (!r)
3154 25 : return 0;
3155 :
3156 0 : r = read_attr_fd(f->fd, &attrs);
3157 0 : if (r < 0)
3158 0 : return log_warning_errno(r, "Failed to read file attributes: %m");
3159 :
3160 0 : if (attrs & FS_NOCOW_FL) {
3161 0 : log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3162 0 : return 0;
3163 : }
3164 :
3165 0 : log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3166 : "This is likely to slow down journal access substantially, please consider turning "
3167 : "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3168 :
3169 0 : return 1;
3170 : }
3171 :
3172 9843 : int journal_file_open(
3173 : int fd,
3174 : const char *fname,
3175 : int flags,
3176 : mode_t mode,
3177 : bool compress,
3178 : uint64_t compress_threshold_bytes,
3179 : bool seal,
3180 : JournalMetrics *metrics,
3181 : MMapCache *mmap_cache,
3182 : Set *deferred_closes,
3183 : JournalFile *template,
3184 : JournalFile **ret) {
3185 :
3186 9843 : bool newly_created = false;
3187 : JournalFile *f;
3188 : void *h;
3189 : int r;
3190 :
3191 9843 : assert(ret);
3192 9843 : assert(fd >= 0 || fname);
3193 :
3194 9843 : if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3195 0 : return -EINVAL;
3196 :
3197 9843 : if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3198 0 : return -EINVAL;
3199 :
3200 9843 : f = new(JournalFile, 1);
3201 9843 : if (!f)
3202 0 : return -ENOMEM;
3203 :
3204 9843 : *f = (JournalFile) {
3205 : .fd = fd,
3206 : .mode = mode,
3207 :
3208 : .flags = flags,
3209 9843 : .prot = prot_from_flags(flags),
3210 9843 : .writable = (flags & O_ACCMODE) != O_RDONLY,
3211 :
3212 : #if HAVE_LZ4
3213 : .compress_lz4 = compress,
3214 : #elif HAVE_XZ
3215 : .compress_xz = compress,
3216 : #endif
3217 : .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3218 9843 : DEFAULT_COMPRESS_THRESHOLD :
3219 9822 : MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
3220 : #if HAVE_GCRYPT
3221 : .seal = seal,
3222 : #endif
3223 : };
3224 :
3225 9843 : if (DEBUG_LOGGING) {
3226 : static int last_seal = -1, last_compress = -1;
3227 : static uint64_t last_bytes = UINT64_MAX;
3228 : char bytes[FORMAT_BYTES_MAX];
3229 :
3230 9741 : if (last_seal != f->seal ||
3231 9732 : last_compress != JOURNAL_FILE_COMPRESS(f) ||
3232 9725 : last_bytes != f->compress_threshold_bytes) {
3233 :
3234 18 : log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3235 : yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3236 : format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
3237 18 : last_seal = f->seal;
3238 18 : last_compress = JOURNAL_FILE_COMPRESS(f);
3239 18 : last_bytes = f->compress_threshold_bytes;
3240 : }
3241 : }
3242 :
3243 9843 : if (mmap_cache)
3244 9818 : f->mmap = mmap_cache_ref(mmap_cache);
3245 : else {
3246 25 : f->mmap = mmap_cache_new();
3247 25 : if (!f->mmap) {
3248 0 : r = -ENOMEM;
3249 0 : goto fail;
3250 : }
3251 : }
3252 :
3253 9843 : if (fname) {
3254 9843 : f->path = strdup(fname);
3255 9843 : if (!f->path) {
3256 0 : r = -ENOMEM;
3257 0 : goto fail;
3258 : }
3259 : } else {
3260 0 : assert(fd >= 0);
3261 :
3262 : /* If we don't know the path, fill in something explanatory and vaguely useful */
3263 0 : if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3264 0 : r = -ENOMEM;
3265 0 : goto fail;
3266 : }
3267 : }
3268 :
3269 9843 : f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3270 9843 : if (!f->chain_cache) {
3271 0 : r = -ENOMEM;
3272 0 : goto fail;
3273 : }
3274 :
3275 9843 : if (f->fd < 0) {
3276 : /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3277 : * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3278 : * it doesn't hurt in that case. */
3279 :
3280 27 : f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3281 27 : if (f->fd < 0) {
3282 0 : r = -errno;
3283 0 : goto fail;
3284 : }
3285 :
3286 : /* fds we opened here by us should also be closed by us. */
3287 27 : f->close_fd = true;
3288 :
3289 27 : r = fd_nonblock(f->fd, false);
3290 27 : if (r < 0)
3291 0 : goto fail;
3292 : }
3293 :
3294 9843 : f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3295 9843 : if (!f->cache_fd) {
3296 0 : r = -ENOMEM;
3297 0 : goto fail;
3298 : }
3299 :
3300 9843 : r = journal_file_fstat(f);
3301 9843 : if (r < 0)
3302 0 : goto fail;
3303 :
3304 9843 : if (f->last_stat.st_size == 0 && f->writable) {
3305 :
3306 25 : (void) journal_file_warn_btrfs(f);
3307 :
3308 : /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3309 : * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3310 : * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3311 : * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3312 : * solely on mtime/atime/ctime of the file. */
3313 25 : (void) fd_setcrtime(f->fd, 0);
3314 :
3315 : #if HAVE_GCRYPT
3316 : /* Try to load the FSPRG state, and if we can't, then
3317 : * just don't do sealing */
3318 25 : if (f->seal) {
3319 12 : r = journal_file_fss_load(f);
3320 12 : if (r < 0)
3321 12 : f->seal = false;
3322 : }
3323 : #endif
3324 :
3325 25 : r = journal_file_init_header(f, template);
3326 25 : if (r < 0)
3327 0 : goto fail;
3328 :
3329 25 : r = journal_file_fstat(f);
3330 25 : if (r < 0)
3331 0 : goto fail;
3332 :
3333 25 : newly_created = true;
3334 : }
3335 :
3336 9843 : if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3337 0 : r = -ENODATA;
3338 0 : goto fail;
3339 : }
3340 :
3341 9843 : r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3342 9843 : if (r == -EINVAL) {
3343 : /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
3344 : * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
3345 : * code. */
3346 0 : r = -EAFNOSUPPORT;
3347 0 : goto fail;
3348 : }
3349 9843 : if (r < 0)
3350 0 : goto fail;
3351 :
3352 9843 : f->header = h;
3353 :
3354 9843 : if (!newly_created) {
3355 9818 : set_clear_with_destructor(deferred_closes, journal_file_close);
3356 :
3357 9818 : r = journal_file_verify_header(f);
3358 9818 : if (r < 0)
3359 0 : goto fail;
3360 : }
3361 :
3362 : #if HAVE_GCRYPT
3363 9843 : if (!newly_created && f->writable) {
3364 1 : r = journal_file_fss_load(f);
3365 1 : if (r < 0)
3366 0 : goto fail;
3367 : }
3368 : #endif
3369 :
3370 9843 : if (f->writable) {
3371 26 : if (metrics) {
3372 0 : journal_default_metrics(metrics, f->fd);
3373 0 : f->metrics = *metrics;
3374 26 : } else if (template)
3375 3 : f->metrics = template->metrics;
3376 :
3377 26 : r = journal_file_refresh_header(f);
3378 26 : if (r < 0)
3379 0 : goto fail;
3380 : }
3381 :
3382 : #if HAVE_GCRYPT
3383 9843 : r = journal_file_hmac_setup(f);
3384 9843 : if (r < 0)
3385 0 : goto fail;
3386 : #endif
3387 :
3388 9843 : if (newly_created) {
3389 25 : r = journal_file_setup_field_hash_table(f);
3390 25 : if (r < 0)
3391 0 : goto fail;
3392 :
3393 25 : r = journal_file_setup_data_hash_table(f);
3394 25 : if (r < 0)
3395 0 : goto fail;
3396 :
3397 : #if HAVE_GCRYPT
3398 25 : r = journal_file_append_first_tag(f);
3399 25 : if (r < 0)
3400 0 : goto fail;
3401 : #endif
3402 : }
3403 :
3404 9843 : if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3405 0 : r = -EIO;
3406 0 : goto fail;
3407 : }
3408 :
3409 9843 : if (template && template->post_change_timer) {
3410 0 : r = journal_file_enable_post_change_timer(
3411 : f,
3412 : sd_event_source_get_event(template->post_change_timer),
3413 : template->post_change_timer_period);
3414 :
3415 0 : if (r < 0)
3416 0 : goto fail;
3417 : }
3418 :
3419 : /* The file is opened now successfully, thus we take possession of any passed in fd. */
3420 9843 : f->close_fd = true;
3421 :
3422 9843 : *ret = f;
3423 9843 : return 0;
3424 :
3425 0 : fail:
3426 0 : if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3427 0 : r = -EIO;
3428 :
3429 0 : (void) journal_file_close(f);
3430 :
3431 0 : return r;
3432 : }
3433 :
3434 2 : int journal_file_archive(JournalFile *f) {
3435 2 : _cleanup_free_ char *p = NULL;
3436 :
3437 2 : assert(f);
3438 :
3439 2 : if (!f->writable)
3440 0 : return -EINVAL;
3441 :
3442 : /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3443 : * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3444 2 : if (path_startswith(f->path, "/proc/self/fd"))
3445 0 : return -EINVAL;
3446 :
3447 2 : if (!endswith(f->path, ".journal"))
3448 0 : return -EINVAL;
3449 :
3450 2 : if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3451 2 : (int) strlen(f->path) - 8, f->path,
3452 2 : SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3453 2 : le64toh(f->header->head_entry_seqnum),
3454 2 : le64toh(f->header->head_entry_realtime)) < 0)
3455 0 : return -ENOMEM;
3456 :
3457 : /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3458 : * ignore that case. */
3459 2 : if (rename(f->path, p) < 0 && errno != ENOENT)
3460 0 : return -errno;
3461 :
3462 : /* Sync the rename to disk */
3463 2 : (void) fsync_directory_of_file(f->fd);
3464 :
3465 : /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3466 : * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3467 : * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3468 : * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3469 : * occurs. */
3470 2 : f->archive = true;
3471 :
3472 : /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3473 : * files when we archive them */
3474 2 : f->defrag_on_close = true;
3475 :
3476 2 : return 0;
3477 : }
3478 :
3479 2 : JournalFile* journal_initiate_close(
3480 : JournalFile *f,
3481 : Set *deferred_closes) {
3482 :
3483 : int r;
3484 :
3485 2 : assert(f);
3486 :
3487 2 : if (deferred_closes) {
3488 :
3489 0 : r = set_put(deferred_closes, f);
3490 0 : if (r < 0)
3491 0 : log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3492 : else {
3493 0 : (void) journal_file_set_offline(f, false);
3494 0 : return NULL;
3495 : }
3496 : }
3497 :
3498 2 : return journal_file_close(f);
3499 : }
3500 :
3501 2 : int journal_file_rotate(
3502 : JournalFile **f,
3503 : bool compress,
3504 : uint64_t compress_threshold_bytes,
3505 : bool seal,
3506 : Set *deferred_closes) {
3507 :
3508 2 : JournalFile *new_file = NULL;
3509 : int r;
3510 :
3511 2 : assert(f);
3512 2 : assert(*f);
3513 :
3514 2 : r = journal_file_archive(*f);
3515 2 : if (r < 0)
3516 0 : return r;
3517 :
3518 10 : r = journal_file_open(
3519 : -1,
3520 2 : (*f)->path,
3521 2 : (*f)->flags,
3522 2 : (*f)->mode,
3523 : compress,
3524 : compress_threshold_bytes,
3525 : seal,
3526 : NULL, /* metrics */
3527 2 : (*f)->mmap,
3528 : deferred_closes,
3529 : *f, /* template */
3530 : &new_file);
3531 :
3532 2 : journal_initiate_close(*f, deferred_closes);
3533 2 : *f = new_file;
3534 :
3535 2 : return r;
3536 : }
3537 :
3538 0 : int journal_file_dispose(int dir_fd, const char *fname) {
3539 0 : _cleanup_free_ char *p = NULL;
3540 0 : _cleanup_close_ int fd = -1;
3541 :
3542 0 : assert(fname);
3543 :
3544 : /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that
3545 : * this is done without looking into the file or changing any of its contents. The idea is that this is called
3546 : * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3547 : * for writing anymore. */
3548 :
3549 0 : if (!endswith(fname, ".journal"))
3550 0 : return -EINVAL;
3551 :
3552 0 : if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3553 0 : (int) strlen(fname) - 8, fname,
3554 : now(CLOCK_REALTIME),
3555 : random_u64()) < 0)
3556 0 : return -ENOMEM;
3557 :
3558 0 : if (renameat(dir_fd, fname, dir_fd, p) < 0)
3559 0 : return -errno;
3560 :
3561 : /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3562 0 : fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3563 0 : if (fd < 0)
3564 0 : log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3565 : else {
3566 0 : (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3567 0 : (void) btrfs_defrag_fd(fd);
3568 : }
3569 :
3570 0 : return 0;
3571 : }
3572 :
3573 0 : int journal_file_open_reliably(
3574 : const char *fname,
3575 : int flags,
3576 : mode_t mode,
3577 : bool compress,
3578 : uint64_t compress_threshold_bytes,
3579 : bool seal,
3580 : JournalMetrics *metrics,
3581 : MMapCache *mmap_cache,
3582 : Set *deferred_closes,
3583 : JournalFile *template,
3584 : JournalFile **ret) {
3585 :
3586 : int r;
3587 :
3588 0 : r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3589 : deferred_closes, template, ret);
3590 0 : if (!IN_SET(r,
3591 : -EBADMSG, /* Corrupted */
3592 : -ENODATA, /* Truncated */
3593 : -EHOSTDOWN, /* Other machine */
3594 : -EPROTONOSUPPORT, /* Incompatible feature */
3595 : -EBUSY, /* Unclean shutdown */
3596 : -ESHUTDOWN, /* Already archived */
3597 : -EIO, /* IO error, including SIGBUS on mmap */
3598 : -EIDRM, /* File has been deleted */
3599 : -ETXTBSY)) /* File is from the future */
3600 0 : return r;
3601 :
3602 0 : if ((flags & O_ACCMODE) == O_RDONLY)
3603 0 : return r;
3604 :
3605 0 : if (!(flags & O_CREAT))
3606 0 : return r;
3607 :
3608 0 : if (!endswith(fname, ".journal"))
3609 0 : return r;
3610 :
3611 : /* The file is corrupted. Rotate it away and try it again (but only once) */
3612 0 : log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3613 :
3614 0 : r = journal_file_dispose(AT_FDCWD, fname);
3615 0 : if (r < 0)
3616 0 : return r;
3617 :
3618 0 : return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3619 : deferred_closes, template, ret);
3620 : }
3621 :
3622 10000 : int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3623 : uint64_t i, n;
3624 10000 : uint64_t q, xor_hash = 0;
3625 : int r;
3626 : EntryItem *items;
3627 : dual_timestamp ts;
3628 : const sd_id128_t *boot_id;
3629 :
3630 10000 : assert(from);
3631 10000 : assert(to);
3632 10000 : assert(o);
3633 10000 : assert(p);
3634 :
3635 10000 : if (!to->writable)
3636 0 : return -EPERM;
3637 :
3638 10000 : ts.monotonic = le64toh(o->entry.monotonic);
3639 10000 : ts.realtime = le64toh(o->entry.realtime);
3640 10000 : boot_id = &o->entry.boot_id;
3641 :
3642 10000 : n = journal_file_entry_n_items(o);
3643 : /* alloca() can't take 0, hence let's allocate at least one */
3644 10000 : items = newa(EntryItem, MAX(1u, n));
3645 :
3646 235468 : for (i = 0; i < n; i++) {
3647 : uint64_t l, h;
3648 : le64_t le_hash;
3649 : size_t t;
3650 : void *data;
3651 : Object *u;
3652 :
3653 225468 : q = le64toh(o->entry.items[i].object_offset);
3654 225468 : le_hash = o->entry.items[i].hash;
3655 :
3656 225468 : r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3657 225468 : if (r < 0)
3658 0 : return r;
3659 :
3660 225468 : if (le_hash != o->data.hash)
3661 0 : return -EBADMSG;
3662 :
3663 225468 : l = le64toh(o->object.size) - offsetof(Object, data.payload);
3664 225468 : t = (size_t) l;
3665 :
3666 : /* We hit the limit on 32bit machines */
3667 225468 : if ((uint64_t) t != l)
3668 0 : return -E2BIG;
3669 :
3670 225468 : if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3671 : #if HAVE_XZ || HAVE_LZ4
3672 34 : size_t rsize = 0;
3673 :
3674 68 : r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3675 34 : o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3676 34 : if (r < 0)
3677 0 : return r;
3678 :
3679 34 : data = from->compress_buffer;
3680 34 : l = rsize;
3681 : #else
3682 : return -EPROTONOSUPPORT;
3683 : #endif
3684 : } else
3685 225434 : data = o->data.payload;
3686 :
3687 225468 : r = journal_file_append_data(to, data, l, &u, &h);
3688 225468 : if (r < 0)
3689 0 : return r;
3690 :
3691 225468 : xor_hash ^= le64toh(u->data.hash);
3692 225468 : items[i].object_offset = htole64(h);
3693 225468 : items[i].hash = u->data.hash;
3694 :
3695 225468 : r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3696 225468 : if (r < 0)
3697 0 : return r;
3698 : }
3699 :
3700 10000 : r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3701 : NULL, NULL, NULL);
3702 :
3703 10000 : if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3704 0 : return -EIO;
3705 :
3706 10000 : return r;
3707 : }
3708 :
3709 0 : void journal_reset_metrics(JournalMetrics *m) {
3710 0 : assert(m);
3711 :
3712 : /* Set everything to "pick automatic values". */
3713 :
3714 0 : *m = (JournalMetrics) {
3715 : .min_use = (uint64_t) -1,
3716 : .max_use = (uint64_t) -1,
3717 : .min_size = (uint64_t) -1,
3718 : .max_size = (uint64_t) -1,
3719 : .keep_free = (uint64_t) -1,
3720 : .n_max_files = (uint64_t) -1,
3721 : };
3722 0 : }
3723 :
3724 0 : void journal_default_metrics(JournalMetrics *m, int fd) {
3725 : char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3726 : struct statvfs ss;
3727 0 : uint64_t fs_size = 0;
3728 :
3729 0 : assert(m);
3730 0 : assert(fd >= 0);
3731 :
3732 0 : if (fstatvfs(fd, &ss) >= 0)
3733 0 : fs_size = ss.f_frsize * ss.f_blocks;
3734 : else
3735 0 : log_debug_errno(errno, "Failed to determine disk size: %m");
3736 :
3737 0 : if (m->max_use == (uint64_t) -1) {
3738 :
3739 0 : if (fs_size > 0)
3740 0 : m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
3741 : MAX_USE_LOWER, MAX_USE_UPPER);
3742 : else
3743 0 : m->max_use = MAX_USE_LOWER;
3744 : } else {
3745 0 : m->max_use = PAGE_ALIGN(m->max_use);
3746 :
3747 0 : if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3748 0 : m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3749 : }
3750 :
3751 0 : if (m->min_use == (uint64_t) -1) {
3752 0 : if (fs_size > 0)
3753 0 : m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
3754 : MIN_USE_LOW, MIN_USE_HIGH);
3755 : else
3756 0 : m->min_use = MIN_USE_LOW;
3757 : }
3758 :
3759 0 : if (m->min_use > m->max_use)
3760 0 : m->min_use = m->max_use;
3761 :
3762 0 : if (m->max_size == (uint64_t) -1)
3763 0 : m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
3764 : MAX_SIZE_UPPER);
3765 : else
3766 0 : m->max_size = PAGE_ALIGN(m->max_size);
3767 :
3768 0 : if (m->max_size != 0) {
3769 0 : if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3770 0 : m->max_size = JOURNAL_FILE_SIZE_MIN;
3771 :
3772 0 : if (m->max_use != 0 && m->max_size*2 > m->max_use)
3773 0 : m->max_use = m->max_size*2;
3774 : }
3775 :
3776 0 : if (m->min_size == (uint64_t) -1)
3777 0 : m->min_size = JOURNAL_FILE_SIZE_MIN;
3778 : else
3779 0 : m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
3780 : JOURNAL_FILE_SIZE_MIN,
3781 : m->max_size ?: UINT64_MAX);
3782 :
3783 0 : if (m->keep_free == (uint64_t) -1) {
3784 0 : if (fs_size > 0)
3785 0 : m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
3786 : KEEP_FREE_UPPER);
3787 : else
3788 0 : m->keep_free = DEFAULT_KEEP_FREE;
3789 : }
3790 :
3791 0 : if (m->n_max_files == (uint64_t) -1)
3792 0 : m->n_max_files = DEFAULT_N_MAX_FILES;
3793 :
3794 0 : log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3795 : format_bytes(a, sizeof(a), m->min_use),
3796 : format_bytes(b, sizeof(b), m->max_use),
3797 : format_bytes(c, sizeof(c), m->max_size),
3798 : format_bytes(d, sizeof(d), m->min_size),
3799 : format_bytes(e, sizeof(e), m->keep_free),
3800 : m->n_max_files);
3801 0 : }
3802 :
3803 0 : int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3804 0 : assert(f);
3805 0 : assert(f->header);
3806 0 : assert(from || to);
3807 :
3808 0 : if (from) {
3809 0 : if (f->header->head_entry_realtime == 0)
3810 0 : return -ENOENT;
3811 :
3812 0 : *from = le64toh(f->header->head_entry_realtime);
3813 : }
3814 :
3815 0 : if (to) {
3816 0 : if (f->header->tail_entry_realtime == 0)
3817 0 : return -ENOENT;
3818 :
3819 0 : *to = le64toh(f->header->tail_entry_realtime);
3820 : }
3821 :
3822 0 : return 1;
3823 : }
3824 :
3825 0 : int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3826 : Object *o;
3827 : uint64_t p;
3828 : int r;
3829 :
3830 0 : assert(f);
3831 0 : assert(from || to);
3832 :
3833 0 : r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3834 0 : if (r <= 0)
3835 0 : return r;
3836 :
3837 0 : if (le64toh(o->data.n_entries) <= 0)
3838 0 : return 0;
3839 :
3840 0 : if (from) {
3841 0 : r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3842 0 : if (r < 0)
3843 0 : return r;
3844 :
3845 0 : *from = le64toh(o->entry.monotonic);
3846 : }
3847 :
3848 0 : if (to) {
3849 0 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3850 0 : if (r < 0)
3851 0 : return r;
3852 :
3853 0 : r = generic_array_get_plus_one(f,
3854 0 : le64toh(o->data.entry_offset),
3855 0 : le64toh(o->data.entry_array_offset),
3856 0 : le64toh(o->data.n_entries)-1,
3857 : &o, NULL);
3858 0 : if (r <= 0)
3859 0 : return r;
3860 :
3861 0 : *to = le64toh(o->entry.monotonic);
3862 : }
3863 :
3864 0 : return 1;
3865 : }
3866 :
3867 13 : bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3868 13 : assert(f);
3869 13 : assert(f->header);
3870 :
3871 : /* If we gained new header fields we gained new features,
3872 : * hence suggest a rotation */
3873 13 : if (le64toh(f->header->header_size) < sizeof(Header)) {
3874 0 : log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3875 0 : return true;
3876 : }
3877 :
3878 : /* Let's check if the hash tables grew over a certain fill
3879 : * level (75%, borrowing this value from Java's hash table
3880 : * implementation), and if so suggest a rotation. To calculate
3881 : * the fill level we need the n_data field, which only exists
3882 : * in newer versions. */
3883 :
3884 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3885 13 : if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3886 0 : log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3887 : f->path,
3888 : 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3889 : le64toh(f->header->n_data),
3890 : le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3891 : (unsigned long long) f->last_stat.st_size,
3892 : f->last_stat.st_size / le64toh(f->header->n_data));
3893 0 : return true;
3894 : }
3895 :
3896 13 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3897 13 : if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3898 0 : log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3899 : f->path,
3900 : 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3901 : le64toh(f->header->n_fields),
3902 : le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3903 0 : return true;
3904 : }
3905 :
3906 : /* Are the data objects properly indexed by field objects? */
3907 26 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3908 26 : JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3909 22 : le64toh(f->header->n_data) > 0 &&
3910 9 : le64toh(f->header->n_fields) == 0)
3911 7 : return true;
3912 :
3913 6 : if (max_file_usec > 0) {
3914 : usec_t t, h;
3915 :
3916 0 : h = le64toh(f->header->head_entry_realtime);
3917 0 : t = now(CLOCK_REALTIME);
3918 :
3919 0 : if (h > 0 && t > h + max_file_usec)
3920 0 : return true;
3921 : }
3922 :
3923 6 : return false;
3924 : }
|