LCOV - code coverage report
Current view: top level - journal - journal-file.c (source / functions) Hit Total Coverage
Test: main_coverage.info Lines: 1274 1916 66.5 %
Date: 2019-08-22 15:41:25 Functions: 73 87 83.9 %

          Line data    Source code
       1             : /* SPDX-License-Identifier: LGPL-2.1+ */
       2             : 
       3             : #include <errno.h>
       4             : #include <fcntl.h>
       5             : #include <linux/fs.h>
       6             : #include <pthread.h>
       7             : #include <stddef.h>
       8             : #include <sys/mman.h>
       9             : #include <sys/statvfs.h>
      10             : #include <sys/uio.h>
      11             : #include <unistd.h>
      12             : 
      13             : #include "sd-event.h"
      14             : 
      15             : #include "alloc-util.h"
      16             : #include "btrfs-util.h"
      17             : #include "chattr-util.h"
      18             : #include "compress.h"
      19             : #include "fd-util.h"
      20             : #include "format-util.h"
      21             : #include "fs-util.h"
      22             : #include "journal-authenticate.h"
      23             : #include "journal-def.h"
      24             : #include "journal-file.h"
      25             : #include "lookup3.h"
      26             : #include "memory-util.h"
      27             : #include "path-util.h"
      28             : #include "random-util.h"
      29             : #include "set.h"
      30             : #include "sort-util.h"
      31             : #include "stat-util.h"
      32             : #include "string-util.h"
      33             : #include "strv.h"
      34             : #include "xattr-util.h"
      35             : 
      36             : #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
      37             : #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
      38             : 
      39             : #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
      40             : #define MIN_COMPRESS_THRESHOLD (8ULL)
      41             : 
      42             : /* This is the minimum journal file size */
      43             : #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL)             /* 512 KiB */
      44             : 
      45             : /* These are the lower and upper bounds if we deduce the max_use value
      46             :  * from the file system size */
      47             : #define MAX_USE_LOWER (1 * 1024 * 1024ULL)                /* 1 MiB */
      48             : #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL)         /* 4 GiB */
      49             : 
      50             : /* Those are the lower and upper bounds for the minimal use limit,
      51             :  * i.e. how much we'll use even if keep_free suggests otherwise. */
      52             : #define MIN_USE_LOW (1 * 1024 * 1024ULL)                  /* 1 MiB */
      53             : #define MIN_USE_HIGH (16 * 1024 * 1024ULL)                /* 16 MiB */
      54             : 
      55             : /* This is the upper bound if we deduce max_size from max_use */
      56             : #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL)             /* 128 MiB */
      57             : 
      58             : /* This is the upper bound if we deduce the keep_free value from the
      59             :  * file system size */
      60             : #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL)       /* 4 GiB */
      61             : 
      62             : /* This is the keep_free value when we can't determine the system
      63             :  * size */
      64             : #define DEFAULT_KEEP_FREE (1024 * 1024ULL)                /* 1 MB */
      65             : 
      66             : /* This is the default maximum number of journal files to keep around. */
      67             : #define DEFAULT_N_MAX_FILES 100
      68             : 
      69             : /* n_data was the first entry we added after the initial file format design */
      70             : #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
      71             : 
      72             : /* How many entries to keep in the entry array chain cache at max */
      73             : #define CHAIN_CACHE_MAX 20
      74             : 
      75             : /* How much to increase the journal file size at once each time we allocate something new. */
      76             : #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL)          /* 8MB */
      77             : 
      78             : /* Reread fstat() of the file for detecting deletions at least this often */
      79             : #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
      80             : 
      81             : /* The mmap context to use for the header we pick as one above the last defined typed */
      82             : #define CONTEXT_HEADER _OBJECT_TYPE_MAX
      83             : 
      84             : #ifdef __clang__
      85             : #  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
      86             : #endif
      87             : 
      88             : /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
      89             :  * As a result we use atomic operations on f->offline_state for inter-thread communications with
      90             :  * journal_file_set_offline() and journal_file_set_online(). */
      91          26 : static void journal_file_set_offline_internal(JournalFile *f) {
      92          26 :         assert(f);
      93          26 :         assert(f->fd >= 0);
      94          26 :         assert(f->header);
      95             : 
      96             :         for (;;) {
      97          52 :                 switch (f->offline_state) {
      98           0 :                 case OFFLINE_CANCEL:
      99           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
     100           0 :                                 continue;
     101           0 :                         return;
     102             : 
     103           0 :                 case OFFLINE_AGAIN_FROM_SYNCING:
     104           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
     105           0 :                                 continue;
     106           0 :                         break;
     107             : 
     108           0 :                 case OFFLINE_AGAIN_FROM_OFFLINING:
     109           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
     110           0 :                                 continue;
     111           0 :                         break;
     112             : 
     113          26 :                 case OFFLINE_SYNCING:
     114          26 :                         (void) fsync(f->fd);
     115             : 
     116          26 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
     117           0 :                                 continue;
     118             : 
     119          26 :                         f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
     120          26 :                         (void) fsync(f->fd);
     121          26 :                         break;
     122             : 
     123          26 :                 case OFFLINE_OFFLINING:
     124          26 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
     125           0 :                                 continue;
     126             :                         _fallthrough_;
     127             :                 case OFFLINE_DONE:
     128          26 :                         return;
     129             : 
     130           0 :                 case OFFLINE_JOINED:
     131           0 :                         log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
     132           0 :                         return;
     133             :                 }
     134          26 :         }
     135             : }
     136             : 
     137           0 : static void * journal_file_set_offline_thread(void *arg) {
     138           0 :         JournalFile *f = arg;
     139             : 
     140           0 :         (void) pthread_setname_np(pthread_self(), "journal-offline");
     141             : 
     142           0 :         journal_file_set_offline_internal(f);
     143             : 
     144           0 :         return NULL;
     145             : }
     146             : 
     147          26 : static int journal_file_set_offline_thread_join(JournalFile *f) {
     148             :         int r;
     149             : 
     150          26 :         assert(f);
     151             : 
     152          26 :         if (f->offline_state == OFFLINE_JOINED)
     153          26 :                 return 0;
     154             : 
     155           0 :         r = pthread_join(f->offline_thread, NULL);
     156           0 :         if (r)
     157           0 :                 return -r;
     158             : 
     159           0 :         f->offline_state = OFFLINE_JOINED;
     160             : 
     161           0 :         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
     162           0 :                 return -EIO;
     163             : 
     164           0 :         return 0;
     165             : }
     166             : 
     167             : /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
     168          26 : static bool journal_file_set_offline_try_restart(JournalFile *f) {
     169             :         for (;;) {
     170          26 :                 switch (f->offline_state) {
     171           0 :                 case OFFLINE_AGAIN_FROM_SYNCING:
     172             :                 case OFFLINE_AGAIN_FROM_OFFLINING:
     173           0 :                         return true;
     174             : 
     175           0 :                 case OFFLINE_CANCEL:
     176           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
     177           0 :                                 continue;
     178           0 :                         return true;
     179             : 
     180           0 :                 case OFFLINE_SYNCING:
     181           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
     182           0 :                                 continue;
     183           0 :                         return true;
     184             : 
     185           0 :                 case OFFLINE_OFFLINING:
     186           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
     187           0 :                                 continue;
     188           0 :                         return true;
     189             : 
     190          26 :                 default:
     191          26 :                         return false;
     192             :                 }
     193             :         }
     194             : }
     195             : 
     196             : /* Sets a journal offline.
     197             :  *
     198             :  * If wait is false then an offline is dispatched in a separate thread for a
     199             :  * subsequent journal_file_set_offline() or journal_file_set_online() of the
     200             :  * same journal to synchronize with.
     201             :  *
     202             :  * If wait is true, then either an existing offline thread will be restarted
     203             :  * and joined, or if none exists the offline is simply performed in this
     204             :  * context without involving another thread.
     205             :  */
     206        9843 : int journal_file_set_offline(JournalFile *f, bool wait) {
     207             :         bool restarted;
     208             :         int r;
     209             : 
     210        9843 :         assert(f);
     211             : 
     212        9843 :         if (!f->writable)
     213        9817 :                 return -EPERM;
     214             : 
     215          26 :         if (f->fd < 0 || !f->header)
     216           0 :                 return -EINVAL;
     217             : 
     218             :         /* An offlining journal is implicitly online and may modify f->header->state,
     219             :          * we must also join any potentially lingering offline thread when not online. */
     220          26 :         if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
     221           0 :                 return journal_file_set_offline_thread_join(f);
     222             : 
     223             :         /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
     224          26 :         restarted = journal_file_set_offline_try_restart(f);
     225          26 :         if ((restarted && wait) || !restarted) {
     226          26 :                 r = journal_file_set_offline_thread_join(f);
     227          26 :                 if (r < 0)
     228           0 :                         return r;
     229             :         }
     230             : 
     231          26 :         if (restarted)
     232           0 :                 return 0;
     233             : 
     234             :         /* Initiate a new offline. */
     235          26 :         f->offline_state = OFFLINE_SYNCING;
     236             : 
     237          26 :         if (wait) /* Without using a thread if waiting. */
     238          26 :                 journal_file_set_offline_internal(f);
     239             :         else {
     240             :                 sigset_t ss, saved_ss;
     241             :                 int k;
     242             : 
     243           0 :                 assert_se(sigfillset(&ss) >= 0);
     244             :                 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
     245             :                  * Asynchronous SIGBUS signals can safely be handled by either thread. */
     246           0 :                 assert_se(sigdelset(&ss, SIGBUS) >= 0);
     247             : 
     248           0 :                 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
     249           0 :                 if (r > 0)
     250           0 :                         return -r;
     251             : 
     252           0 :                 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
     253             : 
     254           0 :                 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
     255           0 :                 if (r > 0) {
     256           0 :                         f->offline_state = OFFLINE_JOINED;
     257           0 :                         return -r;
     258             :                 }
     259           0 :                 if (k > 0)
     260           0 :                         return -k;
     261             :         }
     262             : 
     263          26 :         return 0;
     264             : }
     265             : 
     266       51131 : static int journal_file_set_online(JournalFile *f) {
     267       51131 :         bool wait = true;
     268             : 
     269       51131 :         assert(f);
     270             : 
     271       51131 :         if (!f->writable)
     272           0 :                 return -EPERM;
     273             : 
     274       51131 :         if (f->fd < 0 || !f->header)
     275           0 :                 return -EINVAL;
     276             : 
     277      102262 :         while (wait) {
     278       51131 :                 switch (f->offline_state) {
     279       51131 :                 case OFFLINE_JOINED:
     280             :                         /* No offline thread, no need to wait. */
     281       51131 :                         wait = false;
     282       51131 :                         break;
     283             : 
     284           0 :                 case OFFLINE_SYNCING:
     285           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
     286           0 :                                 continue;
     287             :                         /* Canceled syncing prior to offlining, no need to wait. */
     288           0 :                         wait = false;
     289           0 :                         break;
     290             : 
     291           0 :                 case OFFLINE_AGAIN_FROM_SYNCING:
     292           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
     293           0 :                                 continue;
     294             :                         /* Canceled restart from syncing, no need to wait. */
     295           0 :                         wait = false;
     296           0 :                         break;
     297             : 
     298           0 :                 case OFFLINE_AGAIN_FROM_OFFLINING:
     299           0 :                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
     300           0 :                                 continue;
     301             :                         /* Canceled restart from offlining, must wait for offlining to complete however. */
     302             :                         _fallthrough_;
     303             :                 default: {
     304             :                         int r;
     305             : 
     306           0 :                         r = journal_file_set_offline_thread_join(f);
     307           0 :                         if (r < 0)
     308           0 :                                 return r;
     309             : 
     310           0 :                         wait = false;
     311           0 :                         break;
     312             :                 }
     313             :                 }
     314             :         }
     315             : 
     316       51131 :         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
     317           0 :                 return -EIO;
     318             : 
     319       51131 :         switch (f->header->state) {
     320       51105 :                 case STATE_ONLINE:
     321       51105 :                         return 0;
     322             : 
     323          26 :                 case STATE_OFFLINE:
     324          26 :                         f->header->state = STATE_ONLINE;
     325          26 :                         (void) fsync(f->fd);
     326          26 :                         return 0;
     327             : 
     328           0 :                 default:
     329           0 :                         return -EINVAL;
     330             :         }
     331             : }
     332             : 
     333          26 : bool journal_file_is_offlining(JournalFile *f) {
     334          26 :         assert(f);
     335             : 
     336          26 :         __sync_synchronize();
     337             : 
     338          26 :         if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
     339          26 :                 return false;
     340             : 
     341           0 :         return true;
     342             : }
     343             : 
     344        9843 : JournalFile* journal_file_close(JournalFile *f) {
     345        9843 :         if (!f)
     346           0 :                 return NULL;
     347             : 
     348             : #if HAVE_GCRYPT
     349             :         /* Write the final tag */
     350        9843 :         if (f->seal && f->writable) {
     351             :                 int r;
     352             : 
     353           0 :                 r = journal_file_append_tag(f);
     354           0 :                 if (r < 0)
     355           0 :                         log_error_errno(r, "Failed to append tag when closing journal: %m");
     356             :         }
     357             : #endif
     358             : 
     359        9843 :         if (f->post_change_timer) {
     360           0 :                 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
     361           0 :                         journal_file_post_change(f);
     362             : 
     363           0 :                 sd_event_source_disable_unref(f->post_change_timer);
     364             :         }
     365             : 
     366        9843 :         journal_file_set_offline(f, true);
     367             : 
     368        9843 :         if (f->mmap && f->cache_fd)
     369        9843 :                 mmap_cache_free_fd(f->mmap, f->cache_fd);
     370             : 
     371        9843 :         if (f->fd >= 0 && f->defrag_on_close) {
     372             : 
     373             :                 /* Be friendly to btrfs: turn COW back on again now,
     374             :                  * and defragment the file. We won't write to the file
     375             :                  * ever again, hence remove all fragmentation, and
     376             :                  * reenable all the good bits COW usually provides
     377             :                  * (such as data checksumming). */
     378             : 
     379           2 :                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
     380           2 :                 (void) btrfs_defrag_fd(f->fd);
     381             :         }
     382             : 
     383        9843 :         if (f->close_fd)
     384        9843 :                 safe_close(f->fd);
     385        9843 :         free(f->path);
     386             : 
     387        9843 :         mmap_cache_unref(f->mmap);
     388             : 
     389        9843 :         ordered_hashmap_free_free(f->chain_cache);
     390             : 
     391             : #if HAVE_XZ || HAVE_LZ4
     392        9843 :         free(f->compress_buffer);
     393             : #endif
     394             : 
     395             : #if HAVE_GCRYPT
     396        9843 :         if (f->fss_file)
     397           0 :                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
     398             :         else
     399        9843 :                 free(f->fsprg_state);
     400             : 
     401        9843 :         free(f->fsprg_seed);
     402             : 
     403        9843 :         if (f->hmac)
     404           0 :                 gcry_md_close(f->hmac);
     405             : #endif
     406             : 
     407        9843 :         return mfree(f);
     408             : }
     409             : 
     410          25 : static int journal_file_init_header(JournalFile *f, JournalFile *template) {
     411          25 :         Header h = {};
     412             :         ssize_t k;
     413             :         int r;
     414             : 
     415          25 :         assert(f);
     416             : 
     417          25 :         memcpy(h.signature, HEADER_SIGNATURE, 8);
     418          25 :         h.header_size = htole64(ALIGN64(sizeof(h)));
     419             : 
     420          50 :         h.incompatible_flags |= htole32(
     421          50 :                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
     422          25 :                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
     423             : 
     424          50 :         h.compatible_flags = htole32(
     425          25 :                 f->seal * HEADER_COMPATIBLE_SEALED);
     426             : 
     427          25 :         r = sd_id128_randomize(&h.file_id);
     428          25 :         if (r < 0)
     429           0 :                 return r;
     430             : 
     431          25 :         if (template) {
     432           3 :                 h.seqnum_id = template->header->seqnum_id;
     433           3 :                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
     434             :         } else
     435          22 :                 h.seqnum_id = h.file_id;
     436             : 
     437          25 :         k = pwrite(f->fd, &h, sizeof(h), 0);
     438          25 :         if (k < 0)
     439           0 :                 return -errno;
     440             : 
     441          25 :         if (k != sizeof(h))
     442           0 :                 return -EIO;
     443             : 
     444          25 :         return 0;
     445             : }
     446             : 
     447          26 : static int journal_file_refresh_header(JournalFile *f) {
     448             :         sd_id128_t boot_id;
     449             :         int r;
     450             : 
     451          26 :         assert(f);
     452          26 :         assert(f->header);
     453             : 
     454          26 :         r = sd_id128_get_machine(&f->header->machine_id);
     455          26 :         if (IN_SET(r, -ENOENT, -ENOMEDIUM))
     456             :                 /* We don't have a machine-id, let's continue without */
     457           0 :                 zero(f->header->machine_id);
     458          26 :         else if (r < 0)
     459           0 :                 return r;
     460             : 
     461          26 :         r = sd_id128_get_boot(&boot_id);
     462          26 :         if (r < 0)
     463           0 :                 return r;
     464             : 
     465          26 :         f->header->boot_id = boot_id;
     466             : 
     467          26 :         r = journal_file_set_online(f);
     468             : 
     469             :         /* Sync the online state to disk */
     470          26 :         (void) fsync(f->fd);
     471             : 
     472             :         /* We likely just created a new file, also sync the directory this file is located in. */
     473          26 :         (void) fsync_directory_of_file(f->fd);
     474             : 
     475          26 :         return r;
     476             : }
     477             : 
     478        9819 : static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
     479        9819 :         const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
     480        9819 :                 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
     481        9819 :         const char *type = compatible ? "compatible" : "incompatible";
     482             :         uint32_t flags;
     483             : 
     484        9819 :         flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
     485             : 
     486        9819 :         if (flags & ~supported) {
     487           0 :                 if (flags & ~any)
     488           0 :                         log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
     489             :                                   f->path, type, flags & ~any);
     490           0 :                 flags = (flags & any) & ~supported;
     491           0 :                 if (flags) {
     492             :                         const char* strv[3];
     493           0 :                         unsigned n = 0;
     494           0 :                         _cleanup_free_ char *t = NULL;
     495             : 
     496           0 :                         if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
     497           0 :                                 strv[n++] = "sealed";
     498           0 :                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
     499           0 :                                 strv[n++] = "xz-compressed";
     500           0 :                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
     501           0 :                                 strv[n++] = "lz4-compressed";
     502           0 :                         strv[n] = NULL;
     503           0 :                         assert(n < ELEMENTSOF(strv));
     504             : 
     505           0 :                         t = strv_join((char**) strv, ", ");
     506           0 :                         log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
     507             :                                   f->path, type, n > 1 ? "flags" : "flag", strnull(t));
     508             :                 }
     509           0 :                 return true;
     510             :         }
     511             : 
     512        9819 :         return false;
     513             : }
     514             : 
     515        9818 : static int journal_file_verify_header(JournalFile *f) {
     516             :         uint64_t arena_size, header_size;
     517             : 
     518        9818 :         assert(f);
     519        9818 :         assert(f->header);
     520             : 
     521        9818 :         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
     522           0 :                 return -EBADMSG;
     523             : 
     524             :         /* In both read and write mode we refuse to open files with incompatible
     525             :          * flags we don't know. */
     526        9818 :         if (warn_wrong_flags(f, false))
     527           0 :                 return -EPROTONOSUPPORT;
     528             : 
     529             :         /* When open for writing we refuse to open files with compatible flags, too. */
     530        9818 :         if (f->writable && warn_wrong_flags(f, true))
     531           0 :                 return -EPROTONOSUPPORT;
     532             : 
     533        9818 :         if (f->header->state >= _STATE_MAX)
     534           0 :                 return -EBADMSG;
     535             : 
     536        9818 :         header_size = le64toh(f->header->header_size);
     537             : 
     538             :         /* The first addition was n_data, so check that we are at least this large */
     539        9818 :         if (header_size < HEADER_SIZE_MIN)
     540           0 :                 return -EBADMSG;
     541             : 
     542        9818 :         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
     543           0 :                 return -EBADMSG;
     544             : 
     545        9818 :         arena_size = le64toh(f->header->arena_size);
     546             : 
     547        9818 :         if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
     548           0 :                 return -ENODATA;
     549             : 
     550        9818 :         if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
     551           0 :                 return -ENODATA;
     552             : 
     553        9818 :         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
     554        9818 :             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
     555        9818 :             !VALID64(le64toh(f->header->tail_object_offset)) ||
     556        9818 :             !VALID64(le64toh(f->header->entry_array_offset)))
     557           0 :                 return -ENODATA;
     558             : 
     559        9818 :         if (f->writable) {
     560             :                 sd_id128_t machine_id;
     561             :                 uint8_t state;
     562             :                 int r;
     563             : 
     564           1 :                 r = sd_id128_get_machine(&machine_id);
     565           1 :                 if (r < 0)
     566           0 :                         return r;
     567             : 
     568           1 :                 if (!sd_id128_equal(machine_id, f->header->machine_id))
     569           0 :                         return -EHOSTDOWN;
     570             : 
     571           1 :                 state = f->header->state;
     572             : 
     573           1 :                 if (state == STATE_ARCHIVED)
     574           0 :                         return -ESHUTDOWN; /* Already archived */
     575           1 :                 else if (state == STATE_ONLINE)
     576           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
     577             :                                                "Journal file %s is already online. Assuming unclean closing.",
     578             :                                                f->path);
     579           1 :                 else if (state != STATE_OFFLINE)
     580           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
     581             :                                                "Journal file %s has unknown state %i.",
     582             :                                                f->path, state);
     583             : 
     584           1 :                 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
     585           0 :                         return -EBADMSG;
     586             : 
     587             :                 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
     588             :                  * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
     589             :                  * bisection. */
     590           1 :                 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
     591           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
     592             :                                                "Journal file %s is from the future, refusing to append new data to it that'd be older.",
     593             :                                                f->path);
     594             :         }
     595             : 
     596        9818 :         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
     597        9818 :         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
     598             : 
     599        9818 :         f->seal = JOURNAL_HEADER_SEALED(f->header);
     600             : 
     601        9818 :         return 0;
     602             : }
     603             : 
     604        9894 : static int journal_file_fstat(JournalFile *f) {
     605             :         int r;
     606             : 
     607        9894 :         assert(f);
     608        9894 :         assert(f->fd >= 0);
     609             : 
     610        9894 :         if (fstat(f->fd, &f->last_stat) < 0)
     611           0 :                 return -errno;
     612             : 
     613        9894 :         f->last_stat_usec = now(CLOCK_MONOTONIC);
     614             : 
     615             :         /* Refuse dealing with with files that aren't regular */
     616        9894 :         r = stat_verify_regular(&f->last_stat);
     617        9894 :         if (r < 0)
     618           0 :                 return r;
     619             : 
     620             :         /* Refuse appending to files that are already deleted */
     621        9894 :         if (f->last_stat.st_nlink <= 0)
     622           0 :                 return -EIDRM;
     623             : 
     624        9894 :         return 0;
     625             : }
     626             : 
     627       51105 : static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
     628             :         uint64_t old_size, new_size;
     629             :         int r;
     630             : 
     631       51105 :         assert(f);
     632       51105 :         assert(f->header);
     633             : 
     634             :         /* We assume that this file is not sparse, and we know that
     635             :          * for sure, since we always call posix_fallocate()
     636             :          * ourselves */
     637             : 
     638       51105 :         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
     639           0 :                 return -EIO;
     640             : 
     641       51105 :         old_size =
     642       51105 :                 le64toh(f->header->header_size) +
     643       51105 :                 le64toh(f->header->arena_size);
     644             : 
     645       51105 :         new_size = PAGE_ALIGN(offset + size);
     646       51105 :         if (new_size < le64toh(f->header->header_size))
     647           0 :                 new_size = le64toh(f->header->header_size);
     648             : 
     649       51105 :         if (new_size <= old_size) {
     650             : 
     651             :                 /* We already pre-allocated enough space, but before
     652             :                  * we write to it, let's check with fstat() if the
     653             :                  * file got deleted, in order make sure we don't throw
     654             :                  * away the data immediately. Don't check fstat() for
     655             :                  * all writes though, but only once ever 10s. */
     656             : 
     657       51079 :                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
     658       51079 :                         return 0;
     659             : 
     660           0 :                 return journal_file_fstat(f);
     661             :         }
     662             : 
     663             :         /* Allocate more space. */
     664             : 
     665          26 :         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
     666           0 :                 return -E2BIG;
     667             : 
     668          26 :         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
     669             :                 struct statvfs svfs;
     670             : 
     671           0 :                 if (fstatvfs(f->fd, &svfs) >= 0) {
     672             :                         uint64_t available;
     673             : 
     674           0 :                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
     675             : 
     676           0 :                         if (new_size - old_size > available)
     677           0 :                                 return -E2BIG;
     678             :                 }
     679             :         }
     680             : 
     681             :         /* Increase by larger blocks at once */
     682          26 :         new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
     683          26 :         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
     684           0 :                 new_size = f->metrics.max_size;
     685             : 
     686             :         /* Note that the glibc fallocate() fallback is very
     687             :            inefficient, hence we try to minimize the allocation area
     688             :            as we can. */
     689          26 :         r = posix_fallocate(f->fd, old_size, new_size - old_size);
     690          26 :         if (r != 0)
     691           0 :                 return -r;
     692             : 
     693          26 :         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
     694             : 
     695          26 :         return journal_file_fstat(f);
     696             : }
     697             : 
     698     2798797 : static unsigned type_to_context(ObjectType type) {
     699             :         /* One context for each type, plus one catch-all for the rest */
     700             :         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
     701             :         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
     702     2798797 :         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
     703             : }
     704             : 
     705     2798797 : static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
     706             :         int r;
     707             : 
     708     2798797 :         assert(f);
     709     2798797 :         assert(ret);
     710             : 
     711     2798797 :         if (size <= 0)
     712           0 :                 return -EINVAL;
     713             : 
     714             :         /* Avoid SIGBUS on invalid accesses */
     715     2798797 :         if (offset + size > (uint64_t) f->last_stat.st_size) {
     716             :                 /* Hmm, out of range? Let's refresh the fstat() data
     717             :                  * first, before we trust that check. */
     718             : 
     719           0 :                 r = journal_file_fstat(f);
     720           0 :                 if (r < 0)
     721           0 :                         return r;
     722             : 
     723           0 :                 if (offset + size > (uint64_t) f->last_stat.st_size)
     724           0 :                         return -EADDRNOTAVAIL;
     725             :         }
     726             : 
     727     2798797 :         return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
     728             : }
     729             : 
     730     2747114 : static uint64_t minimum_header_size(Object *o) {
     731             : 
     732             :         static const uint64_t table[] = {
     733             :                 [OBJECT_DATA] = sizeof(DataObject),
     734             :                 [OBJECT_FIELD] = sizeof(FieldObject),
     735             :                 [OBJECT_ENTRY] = sizeof(EntryObject),
     736             :                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
     737             :                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
     738             :                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
     739             :                 [OBJECT_TAG] = sizeof(TagObject),
     740             :         };
     741             : 
     742     2747114 :         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
     743           0 :                 return sizeof(ObjectHeader);
     744             : 
     745     2747114 :         return table[o->object.type];
     746             : }
     747             : 
     748             : /* Lightweight object checks. We want this to be fast, so that we won't
     749             :  * slowdown every journal_file_move_to_object() call too much. */
     750     2747114 : static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
     751     2747114 :         assert(f);
     752     2747114 :         assert(o);
     753             : 
     754     2747114 :         switch (o->object.type) {
     755             : 
     756     1041423 :         case OBJECT_DATA: {
     757     1041423 :                 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
     758           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     759             :                                                "Bad n_entries: %" PRIu64 ": %" PRIu64,
     760             :                                                le64toh(o->data.n_entries),
     761             :                                                offset);
     762             : 
     763     1041423 :                 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
     764           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     765             :                                                "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
     766             :                                                offsetof(DataObject, payload),
     767             :                                                le64toh(o->object.size),
     768             :                                                offset);
     769             : 
     770     1041423 :                 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
     771     1041423 :                     !VALID64(le64toh(o->data.next_field_offset)) ||
     772     1041423 :                     !VALID64(le64toh(o->data.entry_offset)) ||
     773     1041423 :                     !VALID64(le64toh(o->data.entry_array_offset)))
     774           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     775             :                                                "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
     776             :                                                le64toh(o->data.next_hash_offset),
     777             :                                                le64toh(o->data.next_field_offset),
     778             :                                                le64toh(o->data.entry_offset),
     779             :                                                le64toh(o->data.entry_array_offset),
     780             :                                                offset);
     781             : 
     782     1041423 :                 break;
     783             :         }
     784             : 
     785       29529 :         case OBJECT_FIELD:
     786       29529 :                 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
     787           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     788             :                                                "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
     789             :                                                offsetof(FieldObject, payload),
     790             :                                                le64toh(o->object.size),
     791             :                                                offset);
     792             : 
     793       29529 :                 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
     794       29529 :                     !VALID64(le64toh(o->field.head_data_offset)))
     795           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     796             :                                                "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
     797             :                                                le64toh(o->field.next_hash_offset),
     798             :                                                le64toh(o->field.head_data_offset),
     799             :                                                offset);
     800       29529 :                 break;
     801             : 
     802      310676 :         case OBJECT_ENTRY:
     803      310676 :                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
     804           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     805             :                                                "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
     806             :                                                offsetof(EntryObject, items),
     807             :                                                le64toh(o->object.size),
     808             :                                                offset);
     809             : 
     810      310676 :                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
     811           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     812             :                                                "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
     813             :                                                (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
     814             :                                                offset);
     815             : 
     816      310676 :                 if (le64toh(o->entry.seqnum) <= 0)
     817           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     818             :                                                "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
     819             :                                                le64toh(o->entry.seqnum),
     820             :                                                offset);
     821             : 
     822      310676 :                 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
     823           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     824             :                                                "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
     825             :                                                le64toh(o->entry.realtime),
     826             :                                                offset);
     827             : 
     828      310676 :                 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
     829           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     830             :                                                "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
     831             :                                                le64toh(o->entry.monotonic),
     832             :                                                offset);
     833             : 
     834      310676 :                 break;
     835             : 
     836          78 :         case OBJECT_DATA_HASH_TABLE:
     837             :         case OBJECT_FIELD_HASH_TABLE:
     838          78 :                 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
     839          78 :                     (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
     840           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     841             :                                                "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
     842             :                                                o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
     843             :                                                le64toh(o->object.size),
     844             :                                                offset);
     845             : 
     846          78 :                 break;
     847             : 
     848     1365408 :         case OBJECT_ENTRY_ARRAY:
     849     1365408 :                 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
     850     1365408 :                     (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
     851           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     852             :                                                "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
     853             :                                                le64toh(o->object.size),
     854             :                                                offset);
     855             : 
     856     1365408 :                 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
     857           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     858             :                                                "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
     859             :                                                le64toh(o->entry_array.next_entry_array_offset),
     860             :                                                offset);
     861             : 
     862     1365408 :                 break;
     863             : 
     864           0 :         case OBJECT_TAG:
     865           0 :                 if (le64toh(o->object.size) != sizeof(TagObject))
     866           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     867             :                                                "Invalid object tag size: %" PRIu64 ": %" PRIu64,
     868             :                                                le64toh(o->object.size),
     869             :                                                offset);
     870             : 
     871           0 :                 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
     872           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     873             :                                                "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
     874             :                                                le64toh(o->tag.epoch), offset);
     875             : 
     876           0 :                 break;
     877             :         }
     878             : 
     879     2747114 :         return 0;
     880             : }
     881             : 
     882     2747552 : int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
     883             :         int r;
     884             :         void *t;
     885             :         size_t tsize;
     886             :         Object *o;
     887             :         uint64_t s;
     888             : 
     889     2747552 :         assert(f);
     890     2747552 :         assert(ret);
     891             : 
     892             :         /* Objects may only be located at multiple of 64 bit */
     893     2747552 :         if (!VALID64(offset))
     894           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     895             :                                        "Attempt to move to object at non-64bit boundary: %" PRIu64,
     896             :                                        offset);
     897             : 
     898             :         /* Object may not be located in the file header */
     899     2747552 :         if (offset < le64toh(f->header->header_size))
     900           3 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     901             :                                        "Attempt to move to object located in file header: %" PRIu64,
     902             :                                        offset);
     903             : 
     904     2747549 :         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
     905     2747549 :         if (r < 0)
     906           0 :                 return r;
     907             : 
     908     2747549 :         o = (Object*) t;
     909     2747549 :         s = le64toh(o->object.size);
     910             : 
     911     2747549 :         if (s == 0)
     912         435 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     913             :                                        "Attempt to move to uninitialized object: %" PRIu64,
     914             :                                        offset);
     915     2747114 :         if (s < sizeof(ObjectHeader))
     916           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     917             :                                        "Attempt to move to overly short object: %" PRIu64,
     918             :                                        offset);
     919             : 
     920     2747114 :         if (o->object.type <= OBJECT_UNUSED)
     921           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     922             :                                        "Attempt to move to object with invalid type: %" PRIu64,
     923             :                                        offset);
     924             : 
     925     2747114 :         if (s < minimum_header_size(o))
     926           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     927             :                                        "Attempt to move to truncated object: %" PRIu64,
     928             :                                        offset);
     929             : 
     930     2747114 :         if (type > OBJECT_UNUSED && o->object.type != type)
     931           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
     932             :                                        "Attempt to move to object of unexpected type: %" PRIu64,
     933             :                                        offset);
     934             : 
     935     2747114 :         if (s > tsize) {
     936           4 :                 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
     937           4 :                 if (r < 0)
     938           0 :                         return r;
     939             : 
     940           4 :                 o = (Object*) t;
     941             :         }
     942             : 
     943     2747114 :         r = journal_file_check_object(f, offset, o);
     944     2747114 :         if (r < 0)
     945           0 :                 return r;
     946             : 
     947     2747114 :         *ret = o;
     948     2747114 :         return 0;
     949             : }
     950             : 
     951       16285 : static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
     952             :         uint64_t r;
     953             : 
     954       16285 :         assert(f);
     955       16285 :         assert(f->header);
     956             : 
     957       16285 :         r = le64toh(f->header->tail_entry_seqnum) + 1;
     958             : 
     959       16285 :         if (seqnum) {
     960             :                 /* If an external seqnum counter was passed, we update
     961             :                  * both the local and the external one, and set it to
     962             :                  * the maximum of both */
     963             : 
     964           7 :                 if (*seqnum + 1 > r)
     965           1 :                         r = *seqnum + 1;
     966             : 
     967           7 :                 *seqnum = r;
     968             :         }
     969             : 
     970       16285 :         f->header->tail_entry_seqnum = htole64(r);
     971             : 
     972       16285 :         if (f->header->head_entry_seqnum == 0)
     973          19 :                 f->header->head_entry_seqnum = htole64(r);
     974             : 
     975       16285 :         return r;
     976             : }
     977             : 
     978       51105 : int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
     979             :         int r;
     980             :         uint64_t p;
     981             :         Object *tail, *o;
     982             :         void *t;
     983             : 
     984       51105 :         assert(f);
     985       51105 :         assert(f->header);
     986       51105 :         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
     987       51105 :         assert(size >= sizeof(ObjectHeader));
     988       51105 :         assert(offset);
     989       51105 :         assert(ret);
     990             : 
     991       51105 :         r = journal_file_set_online(f);
     992       51105 :         if (r < 0)
     993           0 :                 return r;
     994             : 
     995       51105 :         p = le64toh(f->header->tail_object_offset);
     996       51105 :         if (p == 0)
     997          25 :                 p = le64toh(f->header->header_size);
     998             :         else {
     999       51080 :                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
    1000       51080 :                 if (r < 0)
    1001           0 :                         return r;
    1002             : 
    1003       51080 :                 p += ALIGN64(le64toh(tail->object.size));
    1004             :         }
    1005             : 
    1006       51105 :         r = journal_file_allocate(f, p, size);
    1007       51105 :         if (r < 0)
    1008           0 :                 return r;
    1009             : 
    1010       51105 :         r = journal_file_move_to(f, type, false, p, size, &t, NULL);
    1011       51105 :         if (r < 0)
    1012           0 :                 return r;
    1013             : 
    1014       51105 :         o = (Object*) t;
    1015             : 
    1016       51105 :         zero(o->object);
    1017       51105 :         o->object.type = type;
    1018       51105 :         o->object.size = htole64(size);
    1019             : 
    1020       51105 :         f->header->tail_object_offset = htole64(p);
    1021       51105 :         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
    1022             : 
    1023       51105 :         *ret = o;
    1024       51105 :         *offset = p;
    1025             : 
    1026       51105 :         return 0;
    1027             : }
    1028             : 
    1029          25 : static int journal_file_setup_data_hash_table(JournalFile *f) {
    1030             :         uint64_t s, p;
    1031             :         Object *o;
    1032             :         int r;
    1033             : 
    1034          25 :         assert(f);
    1035          25 :         assert(f->header);
    1036             : 
    1037             :         /* We estimate that we need 1 hash table entry per 768 bytes
    1038             :            of journal file and we want to make sure we never get
    1039             :            beyond 75% fill level. Calculate the hash table size for
    1040             :            the maximum file size based on these metrics. */
    1041             : 
    1042          25 :         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
    1043          25 :         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
    1044          25 :                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
    1045             : 
    1046          25 :         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
    1047             : 
    1048          25 :         r = journal_file_append_object(f,
    1049             :                                        OBJECT_DATA_HASH_TABLE,
    1050             :                                        offsetof(Object, hash_table.items) + s,
    1051             :                                        &o, &p);
    1052          25 :         if (r < 0)
    1053           0 :                 return r;
    1054             : 
    1055          25 :         memzero(o->hash_table.items, s);
    1056             : 
    1057          25 :         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
    1058          25 :         f->header->data_hash_table_size = htole64(s);
    1059             : 
    1060          25 :         return 0;
    1061             : }
    1062             : 
    1063          25 : static int journal_file_setup_field_hash_table(JournalFile *f) {
    1064             :         uint64_t s, p;
    1065             :         Object *o;
    1066             :         int r;
    1067             : 
    1068          25 :         assert(f);
    1069          25 :         assert(f->header);
    1070             : 
    1071             :         /* We use a fixed size hash table for the fields as this
    1072             :          * number should grow very slowly only */
    1073             : 
    1074          25 :         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
    1075          25 :         r = journal_file_append_object(f,
    1076             :                                        OBJECT_FIELD_HASH_TABLE,
    1077             :                                        offsetof(Object, hash_table.items) + s,
    1078             :                                        &o, &p);
    1079          25 :         if (r < 0)
    1080           0 :                 return r;
    1081             : 
    1082          25 :         memzero(o->hash_table.items, s);
    1083             : 
    1084          25 :         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
    1085          25 :         f->header->field_hash_table_size = htole64(s);
    1086             : 
    1087          25 :         return 0;
    1088             : }
    1089             : 
    1090      238922 : int journal_file_map_data_hash_table(JournalFile *f) {
    1091             :         uint64_t s, p;
    1092             :         void *t;
    1093             :         int r;
    1094             : 
    1095      238922 :         assert(f);
    1096      238922 :         assert(f->header);
    1097             : 
    1098      238922 :         if (f->data_hash_table)
    1099      238799 :                 return 0;
    1100             : 
    1101         123 :         p = le64toh(f->header->data_hash_table_offset);
    1102         123 :         s = le64toh(f->header->data_hash_table_size);
    1103             : 
    1104         123 :         r = journal_file_move_to(f,
    1105             :                                  OBJECT_DATA_HASH_TABLE,
    1106             :                                  true,
    1107             :                                  p, s,
    1108             :                                  &t, NULL);
    1109         123 :         if (r < 0)
    1110           0 :                 return r;
    1111             : 
    1112         123 :         f->data_hash_table = t;
    1113         123 :         return 0;
    1114             : }
    1115             : 
    1116       28284 : int journal_file_map_field_hash_table(JournalFile *f) {
    1117             :         uint64_t s, p;
    1118             :         void *t;
    1119             :         int r;
    1120             : 
    1121       28284 :         assert(f);
    1122       28284 :         assert(f->header);
    1123             : 
    1124       28284 :         if (f->field_hash_table)
    1125       28268 :                 return 0;
    1126             : 
    1127          16 :         p = le64toh(f->header->field_hash_table_offset);
    1128          16 :         s = le64toh(f->header->field_hash_table_size);
    1129             : 
    1130          16 :         r = journal_file_move_to(f,
    1131             :                                  OBJECT_FIELD_HASH_TABLE,
    1132             :                                  true,
    1133             :                                  p, s,
    1134             :                                  &t, NULL);
    1135          16 :         if (r < 0)
    1136           0 :                 return r;
    1137             : 
    1138          16 :         f->field_hash_table = t;
    1139          16 :         return 0;
    1140             : }
    1141             : 
    1142         112 : static int journal_file_link_field(
    1143             :                 JournalFile *f,
    1144             :                 Object *o,
    1145             :                 uint64_t offset,
    1146             :                 uint64_t hash) {
    1147             : 
    1148             :         uint64_t p, h, m;
    1149             :         int r;
    1150             : 
    1151         112 :         assert(f);
    1152         112 :         assert(f->header);
    1153         112 :         assert(f->field_hash_table);
    1154         112 :         assert(o);
    1155         112 :         assert(offset > 0);
    1156             : 
    1157         112 :         if (o->object.type != OBJECT_FIELD)
    1158           0 :                 return -EINVAL;
    1159             : 
    1160         112 :         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
    1161         112 :         if (m <= 0)
    1162           0 :                 return -EBADMSG;
    1163             : 
    1164             :         /* This might alter the window we are looking at */
    1165         112 :         o->field.next_hash_offset = o->field.head_data_offset = 0;
    1166             : 
    1167         112 :         h = hash % m;
    1168         112 :         p = le64toh(f->field_hash_table[h].tail_hash_offset);
    1169         112 :         if (p == 0)
    1170          96 :                 f->field_hash_table[h].head_hash_offset = htole64(offset);
    1171             :         else {
    1172          16 :                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
    1173          16 :                 if (r < 0)
    1174           0 :                         return r;
    1175             : 
    1176          16 :                 o->field.next_hash_offset = htole64(offset);
    1177             :         }
    1178             : 
    1179         112 :         f->field_hash_table[h].tail_hash_offset = htole64(offset);
    1180             : 
    1181         112 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
    1182         112 :                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
    1183             : 
    1184         112 :         return 0;
    1185             : }
    1186             : 
    1187       28288 : static int journal_file_link_data(
    1188             :                 JournalFile *f,
    1189             :                 Object *o,
    1190             :                 uint64_t offset,
    1191             :                 uint64_t hash) {
    1192             : 
    1193             :         uint64_t p, h, m;
    1194             :         int r;
    1195             : 
    1196       28288 :         assert(f);
    1197       28288 :         assert(f->header);
    1198       28288 :         assert(f->data_hash_table);
    1199       28288 :         assert(o);
    1200       28288 :         assert(offset > 0);
    1201             : 
    1202       28288 :         if (o->object.type != OBJECT_DATA)
    1203           0 :                 return -EINVAL;
    1204             : 
    1205       28288 :         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
    1206       28288 :         if (m <= 0)
    1207           0 :                 return -EBADMSG;
    1208             : 
    1209             :         /* This might alter the window we are looking at */
    1210       28288 :         o->data.next_hash_offset = o->data.next_field_offset = 0;
    1211       28288 :         o->data.entry_offset = o->data.entry_array_offset = 0;
    1212       28288 :         o->data.n_entries = 0;
    1213             : 
    1214       28288 :         h = hash % m;
    1215       28288 :         p = le64toh(f->data_hash_table[h].tail_hash_offset);
    1216       28288 :         if (p == 0)
    1217             :                 /* Only entry in the hash table is easy */
    1218        2399 :                 f->data_hash_table[h].head_hash_offset = htole64(offset);
    1219             :         else {
    1220             :                 /* Move back to the previous data object, to patch in
    1221             :                  * pointer */
    1222             : 
    1223       25889 :                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
    1224       25889 :                 if (r < 0)
    1225           0 :                         return r;
    1226             : 
    1227       25889 :                 o->data.next_hash_offset = htole64(offset);
    1228             :         }
    1229             : 
    1230       28288 :         f->data_hash_table[h].tail_hash_offset = htole64(offset);
    1231             : 
    1232       28288 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
    1233       28288 :                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
    1234             : 
    1235       28288 :         return 0;
    1236             : }
    1237             : 
    1238       28284 : int journal_file_find_field_object_with_hash(
    1239             :                 JournalFile *f,
    1240             :                 const void *field, uint64_t size, uint64_t hash,
    1241             :                 Object **ret, uint64_t *offset) {
    1242             : 
    1243             :         uint64_t p, osize, h, m;
    1244             :         int r;
    1245             : 
    1246       28284 :         assert(f);
    1247       28284 :         assert(f->header);
    1248       28284 :         assert(field && size > 0);
    1249             : 
    1250             :         /* If the field hash table is empty, we can't find anything */
    1251       28284 :         if (le64toh(f->header->field_hash_table_size) <= 0)
    1252           0 :                 return 0;
    1253             : 
    1254             :         /* Map the field hash table, if it isn't mapped yet. */
    1255       28284 :         r = journal_file_map_field_hash_table(f);
    1256       28284 :         if (r < 0)
    1257           0 :                 return r;
    1258             : 
    1259       28284 :         osize = offsetof(Object, field.payload) + size;
    1260             : 
    1261       28284 :         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
    1262       28284 :         if (m <= 0)
    1263           0 :                 return -EBADMSG;
    1264             : 
    1265       28284 :         h = hash % m;
    1266       28284 :         p = le64toh(f->field_hash_table[h].head_hash_offset);
    1267             : 
    1268       29397 :         while (p > 0) {
    1269             :                 Object *o;
    1270             : 
    1271       29285 :                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
    1272       29285 :                 if (r < 0)
    1273       28172 :                         return r;
    1274             : 
    1275       57457 :                 if (le64toh(o->field.hash) == hash &&
    1276       28172 :                     le64toh(o->object.size) == osize &&
    1277       28172 :                     memcmp(o->field.payload, field, size) == 0) {
    1278             : 
    1279       28172 :                         if (ret)
    1280       28172 :                                 *ret = o;
    1281       28172 :                         if (offset)
    1282       28169 :                                 *offset = p;
    1283             : 
    1284       28172 :                         return 1;
    1285             :                 }
    1286             : 
    1287        1113 :                 p = le64toh(o->field.next_hash_offset);
    1288             :         }
    1289             : 
    1290         112 :         return 0;
    1291             : }
    1292             : 
    1293           3 : int journal_file_find_field_object(
    1294             :                 JournalFile *f,
    1295             :                 const void *field, uint64_t size,
    1296             :                 Object **ret, uint64_t *offset) {
    1297             : 
    1298             :         uint64_t hash;
    1299             : 
    1300           3 :         assert(f);
    1301           3 :         assert(field && size > 0);
    1302             : 
    1303           3 :         hash = hash64(field, size);
    1304             : 
    1305           3 :         return journal_file_find_field_object_with_hash(f,
    1306             :                                                         field, size, hash,
    1307             :                                                         ret, offset);
    1308             : }
    1309             : 
    1310      232921 : int journal_file_find_data_object_with_hash(
    1311             :                 JournalFile *f,
    1312             :                 const void *data, uint64_t size, uint64_t hash,
    1313             :                 Object **ret, uint64_t *offset) {
    1314             : 
    1315             :         uint64_t p, osize, h, m;
    1316             :         int r;
    1317             : 
    1318      232921 :         assert(f);
    1319      232921 :         assert(f->header);
    1320      232921 :         assert(data || size == 0);
    1321             : 
    1322             :         /* If there's no data hash table, then there's no entry. */
    1323      232921 :         if (le64toh(f->header->data_hash_table_size) <= 0)
    1324           0 :                 return 0;
    1325             : 
    1326             :         /* Map the data hash table, if it isn't mapped yet. */
    1327      232921 :         r = journal_file_map_data_hash_table(f);
    1328      232921 :         if (r < 0)
    1329           0 :                 return r;
    1330             : 
    1331      232921 :         osize = offsetof(Object, data.payload) + size;
    1332             : 
    1333      232921 :         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
    1334      232921 :         if (m <= 0)
    1335           0 :                 return -EBADMSG;
    1336             : 
    1337      232921 :         h = hash % m;
    1338      232921 :         p = le64toh(f->data_hash_table[h].head_hash_offset);
    1339             : 
    1340      521460 :         while (p > 0) {
    1341             :                 Object *o;
    1342             : 
    1343      492867 :                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
    1344      492867 :                 if (r < 0)
    1345      204328 :                         return r;
    1346             : 
    1347      492867 :                 if (le64toh(o->data.hash) != hash)
    1348      288539 :                         goto next;
    1349             : 
    1350      204328 :                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
    1351             : #if HAVE_XZ || HAVE_LZ4
    1352             :                         uint64_t l;
    1353           0 :                         size_t rsize = 0;
    1354             : 
    1355           0 :                         l = le64toh(o->object.size);
    1356           0 :                         if (l <= offsetof(Object, data.payload))
    1357           0 :                                 return -EBADMSG;
    1358             : 
    1359           0 :                         l -= offsetof(Object, data.payload);
    1360             : 
    1361           0 :                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
    1362           0 :                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
    1363           0 :                         if (r < 0)
    1364           0 :                                 return r;
    1365             : 
    1366           0 :                         if (rsize == size &&
    1367           0 :                             memcmp(f->compress_buffer, data, size) == 0) {
    1368             : 
    1369           0 :                                 if (ret)
    1370           0 :                                         *ret = o;
    1371             : 
    1372           0 :                                 if (offset)
    1373           0 :                                         *offset = p;
    1374             : 
    1375           0 :                                 return 1;
    1376             :                         }
    1377             : #else
    1378             :                         return -EPROTONOSUPPORT;
    1379             : #endif
    1380      204328 :                 } else if (le64toh(o->object.size) == osize &&
    1381      204328 :                            memcmp(o->data.payload, data, size) == 0) {
    1382             : 
    1383      204328 :                         if (ret)
    1384      203725 :                                 *ret = o;
    1385             : 
    1386      204328 :                         if (offset)
    1387      204268 :                                 *offset = p;
    1388             : 
    1389      204328 :                         return 1;
    1390             :                 }
    1391             : 
    1392           0 :         next:
    1393      288539 :                 p = le64toh(o->data.next_hash_offset);
    1394             :         }
    1395             : 
    1396       28593 :         return 0;
    1397             : }
    1398             : 
    1399           7 : int journal_file_find_data_object(
    1400             :                 JournalFile *f,
    1401             :                 const void *data, uint64_t size,
    1402             :                 Object **ret, uint64_t *offset) {
    1403             : 
    1404             :         uint64_t hash;
    1405             : 
    1406           7 :         assert(f);
    1407           7 :         assert(data || size == 0);
    1408             : 
    1409           7 :         hash = hash64(data, size);
    1410             : 
    1411           7 :         return journal_file_find_data_object_with_hash(f,
    1412             :                                                        data, size, hash,
    1413             :                                                        ret, offset);
    1414             : }
    1415             : 
    1416       28281 : static int journal_file_append_field(
    1417             :                 JournalFile *f,
    1418             :                 const void *field, uint64_t size,
    1419             :                 Object **ret, uint64_t *offset) {
    1420             : 
    1421             :         uint64_t hash, p;
    1422             :         uint64_t osize;
    1423             :         Object *o;
    1424             :         int r;
    1425             : 
    1426       28281 :         assert(f);
    1427       28281 :         assert(field && size > 0);
    1428             : 
    1429       28281 :         hash = hash64(field, size);
    1430             : 
    1431       28281 :         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
    1432       28281 :         if (r < 0)
    1433           0 :                 return r;
    1434       28281 :         else if (r > 0) {
    1435             : 
    1436       28169 :                 if (ret)
    1437       28169 :                         *ret = o;
    1438             : 
    1439       28169 :                 if (offset)
    1440       28169 :                         *offset = p;
    1441             : 
    1442       28169 :                 return 0;
    1443             :         }
    1444             : 
    1445         112 :         osize = offsetof(Object, field.payload) + size;
    1446         112 :         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
    1447         112 :         if (r < 0)
    1448           0 :                 return r;
    1449             : 
    1450         112 :         o->field.hash = htole64(hash);
    1451         112 :         memcpy(o->field.payload, field, size);
    1452             : 
    1453         112 :         r = journal_file_link_field(f, o, p, hash);
    1454         112 :         if (r < 0)
    1455           0 :                 return r;
    1456             : 
    1457             :         /* The linking might have altered the window, so let's
    1458             :          * refresh our pointer */
    1459         112 :         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
    1460         112 :         if (r < 0)
    1461           0 :                 return r;
    1462             : 
    1463             : #if HAVE_GCRYPT
    1464         112 :         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
    1465         112 :         if (r < 0)
    1466           0 :                 return r;
    1467             : #endif
    1468             : 
    1469         112 :         if (ret)
    1470         112 :                 *ret = o;
    1471             : 
    1472         112 :         if (offset)
    1473         112 :                 *offset = p;
    1474             : 
    1475         112 :         return 0;
    1476             : }
    1477             : 
    1478      232013 : static int journal_file_append_data(
    1479             :                 JournalFile *f,
    1480             :                 const void *data, uint64_t size,
    1481             :                 Object **ret, uint64_t *offset) {
    1482             : 
    1483             :         uint64_t hash, p;
    1484             :         uint64_t osize;
    1485             :         Object *o;
    1486      232013 :         int r, compression = 0;
    1487             :         const void *eq;
    1488             : 
    1489      232013 :         assert(f);
    1490      232013 :         assert(data || size == 0);
    1491             : 
    1492      232013 :         hash = hash64(data, size);
    1493             : 
    1494      232013 :         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
    1495      232013 :         if (r < 0)
    1496           0 :                 return r;
    1497      232013 :         if (r > 0) {
    1498             : 
    1499      203725 :                 if (ret)
    1500      203725 :                         *ret = o;
    1501             : 
    1502      203725 :                 if (offset)
    1503      203725 :                         *offset = p;
    1504             : 
    1505      203725 :                 return 0;
    1506             :         }
    1507             : 
    1508       28288 :         osize = offsetof(Object, data.payload) + size;
    1509       28288 :         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
    1510       28288 :         if (r < 0)
    1511           0 :                 return r;
    1512             : 
    1513       28288 :         o->data.hash = htole64(hash);
    1514             : 
    1515             : #if HAVE_XZ || HAVE_LZ4
    1516       28288 :         if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
    1517           4 :                 size_t rsize = 0;
    1518             : 
    1519           4 :                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
    1520             : 
    1521           4 :                 if (compression >= 0) {
    1522           4 :                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
    1523           4 :                         o->object.flags |= compression;
    1524             : 
    1525           4 :                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
    1526             :                                   size, rsize, object_compressed_to_string(compression));
    1527             :                 } else
    1528             :                         /* Compression didn't work, we don't really care why, let's continue without compression */
    1529           0 :                         compression = 0;
    1530             :         }
    1531             : #endif
    1532             : 
    1533       28288 :         if (compression == 0)
    1534       28284 :                 memcpy_safe(o->data.payload, data, size);
    1535             : 
    1536       28288 :         r = journal_file_link_data(f, o, p, hash);
    1537       28288 :         if (r < 0)
    1538           0 :                 return r;
    1539             : 
    1540             : #if HAVE_GCRYPT
    1541       28288 :         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
    1542       28288 :         if (r < 0)
    1543           0 :                 return r;
    1544             : #endif
    1545             : 
    1546             :         /* The linking might have altered the window, so let's
    1547             :          * refresh our pointer */
    1548       28288 :         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
    1549       28288 :         if (r < 0)
    1550           0 :                 return r;
    1551             : 
    1552       28288 :         if (!data)
    1553           0 :                 eq = NULL;
    1554             :         else
    1555       28288 :                 eq = memchr(data, '=', size);
    1556       28288 :         if (eq && eq > data) {
    1557       28281 :                 Object *fo = NULL;
    1558             :                 uint64_t fp;
    1559             : 
    1560             :                 /* Create field object ... */
    1561       28281 :                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
    1562       28281 :                 if (r < 0)
    1563           0 :                         return r;
    1564             : 
    1565             :                 /* ... and link it in. */
    1566       28281 :                 o->data.next_field_offset = fo->field.head_data_offset;
    1567       28281 :                 fo->field.head_data_offset = le64toh(p);
    1568             :         }
    1569             : 
    1570       28288 :         if (ret)
    1571       28288 :                 *ret = o;
    1572             : 
    1573       28288 :         if (offset)
    1574       28288 :                 *offset = p;
    1575             : 
    1576       28288 :         return 0;
    1577             : }
    1578             : 
    1579       50891 : uint64_t journal_file_entry_n_items(Object *o) {
    1580       50891 :         assert(o);
    1581             : 
    1582       50891 :         if (o->object.type != OBJECT_ENTRY)
    1583           0 :                 return 0;
    1584             : 
    1585       50891 :         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
    1586             : }
    1587             : 
    1588     1362640 : uint64_t journal_file_entry_array_n_items(Object *o) {
    1589     1362640 :         assert(o);
    1590             : 
    1591     1362640 :         if (o->object.type != OBJECT_ENTRY_ARRAY)
    1592           0 :                 return 0;
    1593             : 
    1594     1362640 :         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
    1595             : }
    1596             : 
    1597        2382 : uint64_t journal_file_hash_table_n_items(Object *o) {
    1598        2382 :         assert(o);
    1599             : 
    1600        2382 :         if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
    1601           0 :                 return 0;
    1602             : 
    1603        2382 :         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
    1604             : }
    1605             : 
    1606      220010 : static int link_entry_into_array(JournalFile *f,
    1607             :                                  le64_t *first,
    1608             :                                  le64_t *idx,
    1609             :                                  uint64_t p) {
    1610             :         int r;
    1611      220010 :         uint64_t n = 0, ap = 0, q, i, a, hidx;
    1612             :         Object *o;
    1613             : 
    1614      220010 :         assert(f);
    1615      220010 :         assert(f->header);
    1616      220010 :         assert(first);
    1617      220010 :         assert(idx);
    1618      220010 :         assert(p > 0);
    1619             : 
    1620      220010 :         a = le64toh(*first);
    1621      220010 :         i = hidx = le64toh(*idx);
    1622     1284082 :         while (a > 0) {
    1623             : 
    1624     1277712 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
    1625     1277712 :                 if (r < 0)
    1626           0 :                         return r;
    1627             : 
    1628     1277712 :                 n = journal_file_entry_array_n_items(o);
    1629     1277712 :                 if (i < n) {
    1630      213640 :                         o->entry_array.items[i] = htole64(p);
    1631      213640 :                         *idx = htole64(hidx + 1);
    1632      213640 :                         return 0;
    1633             :                 }
    1634             : 
    1635     1064072 :                 i -= n;
    1636     1064072 :                 ap = a;
    1637     1064072 :                 a = le64toh(o->entry_array.next_entry_array_offset);
    1638             :         }
    1639             : 
    1640        6370 :         if (hidx > n)
    1641        1209 :                 n = (hidx+1) * 2;
    1642             :         else
    1643        5161 :                 n = n * 2;
    1644             : 
    1645        6370 :         if (n < 4)
    1646        3804 :                 n = 4;
    1647             : 
    1648        6370 :         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
    1649             :                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
    1650             :                                        &o, &q);
    1651        6370 :         if (r < 0)
    1652           0 :                 return r;
    1653             : 
    1654             : #if HAVE_GCRYPT
    1655        6370 :         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
    1656        6370 :         if (r < 0)
    1657           0 :                 return r;
    1658             : #endif
    1659             : 
    1660        6370 :         o->entry_array.items[i] = htole64(p);
    1661             : 
    1662        6370 :         if (ap == 0)
    1663        3804 :                 *first = htole64(q);
    1664             :         else {
    1665        2566 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
    1666        2566 :                 if (r < 0)
    1667           0 :                         return r;
    1668             : 
    1669        2566 :                 o->entry_array.next_entry_array_offset = htole64(q);
    1670             :         }
    1671             : 
    1672        6370 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
    1673        6370 :                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
    1674             : 
    1675        6370 :         *idx = htole64(hidx + 1);
    1676             : 
    1677        6370 :         return 0;
    1678             : }
    1679             : /* Appends entry offset p to a list stored as one inline slot plus an entry array chain.
                     :  *
                     :  * If *idx (the little-endian item count) is zero, p is stored in the inline slot *extra.
                     :  * Otherwise p is handed to link_entry_into_array() for the chain rooted at *first, with
                     :  * the count minus one as the position inside the chain. On success *idx is incremented
                     :  * by one.
                     :  *
                     :  * Returns 0 on success, or the negative error returned by link_entry_into_array(). */
    1680      232013 : static int link_entry_into_array_plus_one(JournalFile *f,
    1681             :                                           le64_t *extra,
    1682             :                                           le64_t *first,
    1683             :                                           le64_t *idx,
    1684             :                                           uint64_t p) {
    1685             : 
    1686             :         int r;
    1687             : 
    1688      232013 :         assert(f);
    1689      232013 :         assert(extra);
    1690      232013 :         assert(first);
    1691      232013 :         assert(idx);
    1692      232013 :         assert(p > 0);
    1693             : 
    1694      232013 :         if (*idx == 0)
    1695       28288 :                 *extra = htole64(p);
    1696             :         else {
    1697             :                 le64_t i;
    1698             :                 /* The chain does not hold the inline item, hence its index is one less than the count. A local copy is passed so that only the increment below modifies *idx. */
    1699      203725 :                 i = htole64(le64toh(*idx) - 1);
    1700      203725 :                 r = link_entry_into_array(f, first, &i, p);
    1701      203725 :                 if (r < 0)
    1702           0 :                         return r;
    1703             :         }
    1704             : 
    1705      232013 :         *idx = htole64(le64toh(*idx) + 1);
    1706      232013 :         return 0;
    1707             : }
    1708             : /* Registers the entry located at 'offset' with the data object referenced by item i of entry object o.
                     :  *
                     :  * Returns -EINVAL if the item's object offset is zero, the negative error of
                     :  * journal_file_move_to_object() if the data object cannot be accessed, and otherwise the
                     :  * result of link_entry_into_array_plus_one(). */
    1709      232013 : static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
    1710             :         uint64_t p;
    1711             :         int r;
    1712      232013 :         assert(f);
    1713      232013 :         assert(o);
    1714      232013 :         assert(offset > 0);
    1715             :         /* p is the file offset of the data object this item points to; zero is rejected */
    1716      232013 :         p = le64toh(o->entry.items[i].object_offset);
    1717      232013 :         if (p == 0)
    1718           0 :                 return -EINVAL;
    1719             : 
    1720      232013 :         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
    1721      232013 :         if (r < 0)
    1722           0 :                 return r;
    1723             :         /* From here on o refers to the data object, no longer to the entry */
    1724      696039 :         return link_entry_into_array_plus_one(f,
    1725      232013 :                                               &o->data.entry_offset,
    1726      232013 :                                               &o->data.entry_array_offset,
    1727      232013 :                                               &o->data.n_entries,
    1728             :                                               offset);
    1729             : }
    1730             : /* Makes the entry object o, located at file offset 'offset', reachable from the rest of the file.
                     :  *
                     :  * The entry is first added to the entry array chain anchored in the file header
                     :  * (entry_array_offset / n_entries), then the header's head/tail timestamps are updated, and
                     :  * finally the entry is linked into every data object referenced by its items.
                     :  *
                     :  * Returns -EINVAL if o is not of type OBJECT_ENTRY, a negative error from the linking
                     :  * helpers, or 0 on success. */
    1731       16285 : static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
    1732             :         uint64_t n, i;
    1733             :         int r;
    1734             : 
    1735       16285 :         assert(f);
    1736       16285 :         assert(f->header);
    1737       16285 :         assert(o);
    1738       16285 :         assert(offset > 0);
    1739             : 
    1740       16285 :         if (o->object.type != OBJECT_ENTRY)
    1741           0 :                 return -EINVAL;
    1742             :         /* Full memory barrier. NOTE(review): presumably so that the entry's contents are written before the entry becomes reachable through the arrays below; confirm. */
    1743       16285 :         __sync_synchronize();
    1744             : 
    1745             :         /* Link up the entry itself */
    1746       32570 :         r = link_entry_into_array(f,
    1747       16285 :                                   &f->header->entry_array_offset,
    1748       16285 :                                   &f->header->n_entries,
    1749             :                                   offset);
    1750       16285 :         if (r < 0)
    1751           0 :                 return r;
    1752             : 
    1753             :         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
    1754             :         /* The head timestamp is only set while it is still zero; the tail timestamps are overwritten on every call */
    1755       16285 :         if (f->header->head_entry_realtime == 0)
    1756          19 :                 f->header->head_entry_realtime = o->entry.realtime;
    1757             : 
    1758       16285 :         f->header->tail_entry_realtime = o->entry.realtime;
    1759       16285 :         f->header->tail_entry_monotonic = o->entry.monotonic;
    1760             : 
    1761             :         /* Link up the items */
    1762       16285 :         n = journal_file_entry_n_items(o);
    1763      248298 :         for (i = 0; i < n; i++) {
    1764      232013 :                 r = journal_file_link_entry_item(f, o, offset, i);
    1765      232013 :                 if (r < 0)
    1766           0 :                         return r;
    1767             :         }
    1768             : 
    1769       16285 :         return 0;
    1770             : }
    1771             : /* Appends a new OBJECT_ENTRY object holding the given items and links it into the file.
                     :  *
                     :  * ts       timestamps stored in the entry (must not be NULL)
                     :  * boot_id  if non-NULL, copied into the file header first; the entry always takes the
                     :  *          header's boot ID
                     :  * xor_hash stored in the entry as is (converted to little-endian)
                     :  * items    n_items entry items copied into the object; may be NULL only if n_items is 0
                     :  * seqnum   passed on to journal_file_entry_seqnum()
                     :  * ret      if non-NULL, receives the new entry object
                     :  * offset   if non-NULL, receives the file offset of the new entry object
                     :  *
                     :  * Returns 0 on success or the negative error of the first helper that fails. */
    1772       16285 : static int journal_file_append_entry_internal(
    1773             :                 JournalFile *f,
    1774             :                 const dual_timestamp *ts,
    1775             :                 const sd_id128_t *boot_id,
    1776             :                 uint64_t xor_hash,
    1777             :                 const EntryItem items[], unsigned n_items,
    1778             :                 uint64_t *seqnum,
    1779             :                 Object **ret, uint64_t *offset) {
    1780             :         uint64_t np;
    1781             :         uint64_t osize;
    1782             :         Object *o;
    1783             :         int r;
    1784             : 
    1785       16285 :         assert(f);
    1786       16285 :         assert(f->header);
    1787       16285 :         assert(items || n_items == 0);
    1788       16285 :         assert(ts);
    1789             :         /* Object size: everything up to the items array, plus one EntryItem per item */
    1790       16285 :         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
    1791             : 
    1792       16285 :         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
    1793       16285 :         if (r < 0)
    1794           0 :                 return r;
    1795             : 
    1796       16285 :         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
    1797       16285 :         memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
    1798       16285 :         o->entry.realtime = htole64(ts->realtime);
    1799       16285 :         o->entry.monotonic = htole64(ts->monotonic);
    1800       16285 :         o->entry.xor_hash = htole64(xor_hash);
    1801       16285 :         if (boot_id)
    1802       10001 :                 f->header->boot_id = *boot_id;
    1803       16285 :         o->entry.boot_id = f->header->boot_id;
    1804             : 
    1805             : #if HAVE_GCRYPT
    1806       16285 :         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
    1807       16285 :         if (r < 0)
    1808           0 :                 return r;
    1809             : #endif
    1810             :         /* The object is fully populated at this point, now hook it into the header and data objects */
    1811       16285 :         r = journal_file_link_entry(f, o, np);
    1812       16285 :         if (r < 0)
    1813           0 :                 return r;
    1814             : 
    1815       16285 :         if (ret)
    1816           0 :                 *ret = o;
    1817             : 
    1818       16285 :         if (offset)
    1819           0 :                 *offset = np;
    1820             : 
    1821       16285 :         return 0;
    1822             : }
    1823             : /* Signals a modification of the journal file by calling ftruncate() on it. Does nothing if f has no valid fd; a failing ftruncate() is only logged at debug level. */
    1824        6285 : void journal_file_post_change(JournalFile *f) {
    1825        6285 :         assert(f);
    1826             : 
    1827        6285 :         if (f->fd < 0)
    1828           0 :                 return;
    1829             : 
    1830             :         /* inotify() does not generate IN_MODIFY events for file
    1831             :          * modifications done via mmap(). After each change we hence
    1832             :          * trigger IN_MODIFY explicitly by truncating the journal file
    1833             :          * to the size recorded in f->last_stat (its own size). */
    1834             : 
    1835        6285 :         __sync_synchronize();
    1836             : 
    1837        6285 :         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
    1838           0 :                 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
    1839             : }
    1840             : /* Timer callback registered by journal_file_enable_post_change_timer(): userdata is the JournalFile to post the change for. The timer and usec arguments are not used. Always returns 1. */
    1841           0 : static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
    1842           0 :         assert(userdata);
    1843             : 
    1844           0 :         journal_file_post_change(userdata);
    1845             : 
    1846           0 :         return 1;
    1847             : }
    1848             : /* Arms the post-change timer of f, unless it is already enabled.
                     :  *
                     :  * The timer is set to the event loop's current CLOCK_MONOTONIC time plus
                     :  * f->post_change_timer_period and enabled as SD_EVENT_ONESHOT. If any step fails the
                     :  * failure is logged at debug level and the change is posted immediately instead. */
    1849           0 : static void schedule_post_change(JournalFile *f) {
    1850             :         uint64_t now;
    1851             :         int r;
    1852             : 
    1853           0 :         assert(f);
    1854           0 :         assert(f->post_change_timer);
    1855             : 
    1856           0 :         r = sd_event_source_get_enabled(f->post_change_timer, NULL);
    1857           0 :         if (r < 0) {
    1858           0 :                 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
    1859           0 :                 goto fail;
    1860             :         }
    1861           0 :         if (r > 0)
    1862           0 :                 return;
    1863             :         /* A positive result above means the timer is already pending, so this change is covered by it */
    1864           0 :         r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
    1865           0 :         if (r < 0) {
    1866           0 :                 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
    1867           0 :                 goto fail;
    1868             :         }
    1869             : 
    1870           0 :         r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
    1871           0 :         if (r < 0) {
    1872           0 :                 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
    1873           0 :                 goto fail;
    1874             :         }
    1875             : 
    1876           0 :         r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
    1877           0 :         if (r < 0) {
    1878           0 :                 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
    1879           0 :                 goto fail;
    1880             :         }
    1881             : 
    1882           0 :         return;
    1883             : 
    1884           0 : fail:
    1885             :         /* On failure, let's simply post the change immediately. */
    1886           0 :         journal_file_post_change(f);
    1887             : }
    1888             : 
    1889             : /* Enable coalesced change posting in a timer on the provided sd_event instance.
                     :  *
                     :  * Adds a CLOCK_MONOTONIC time event source to e that invokes post_change_thunk() with f as
                     :  * userdata, switches it off (SD_EVENT_OFF) and stores it in f->post_change_timer together
                     :  * with the period t.
                     :  *
                     :  * Returns a negative error if the event source cannot be added or disabled; on success the
                     :  * (non-negative) result of sd_event_source_set_enabled() is returned.
                     :  * NOTE(review): assert_return() presumably makes the function return -EINVAL when a timer
                     :  * is already installed; confirm against the macro definition. */
    1890           0 : int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
    1891           0 :         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
    1892             :         int r;
    1893             : 
    1894           0 :         assert(f);
    1895           0 :         assert_return(!f->post_change_timer, -EINVAL);
    1896           0 :         assert(e);
    1897           0 :         assert(t);
    1898             : 
    1899           0 :         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
    1900           0 :         if (r < 0)
    1901           0 :                 return r;
    1902             : 
    1903           0 :         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
    1904           0 :         if (r < 0)
    1905           0 :                 return r;
    1906             :         /* Hand the reference over to f; TAKE_PTR() is used on the _cleanup_-annotated variable for this */
    1907           0 :         f->post_change_timer = TAKE_PTR(timer);
    1908           0 :         f->post_change_timer_period = t;
    1909             : 
    1910           0 :         return r;
    1911             : }
    1912             : /* Comparison function for sorting EntryItem arrays: compares the object_offset fields of a and b after converting them from little-endian, using CMP(). */
    1913         260 : static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
    1914         260 :         return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
    1915             : }
    1916             : /* Appends a new entry made up of the n_iovec fields in iovec[] to the journal file.
                     :  *
                     :  * ts       timestamps for the entry; if NULL, dual_timestamp_get() is used. A caller-supplied
                     :  *          timestamp failing VALID_REALTIME() or VALID_MONOTONIC() is refused (EBADMSG).
                     :  * boot_id, seqnum, ret, offset
                     :  *          passed on unchanged to journal_file_append_entry_internal()
                     :  * iovec    one field per element; may be NULL only if n_iovec is 0
                     :  *
                     :  * Each field is stored through journal_file_append_data(); the XOR of the data objects'
                     :  * hashes becomes the entry's xor_hash. Once the entry append has been attempted, a change
                     :  * notification is posted, or scheduled if a post-change timer is installed.
                     :  *
                     :  * Returns the result of journal_file_append_entry_internal(), -EIO if the mmap cache reports
                     :  * a SIGBUS for this file, or the negative error of an earlier failing step. */
    1917        6285 : int journal_file_append_entry(
    1918             :                 JournalFile *f,
    1919             :                 const dual_timestamp *ts,
    1920             :                 const sd_id128_t *boot_id,
    1921             :                 const struct iovec iovec[], unsigned n_iovec,
    1922             :                 uint64_t *seqnum,
    1923             :                 Object **ret, uint64_t *offset) {
    1924             : 
    1925             :         unsigned i;
    1926             :         EntryItem *items;
    1927             :         int r;
    1928        6285 :         uint64_t xor_hash = 0;
    1929             :         struct dual_timestamp _ts;
    1930             : 
    1931        6285 :         assert(f);
    1932        6285 :         assert(f->header);
    1933        6285 :         assert(iovec || n_iovec == 0);
    1934             : 
    1935        6285 :         if (ts) {
    1936        6285 :                 if (!VALID_REALTIME(ts->realtime))
    1937           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
    1938             :                                                "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
    1939             :                                                ts->realtime);
    1940        6285 :                 if (!VALID_MONOTONIC(ts->monotonic))
    1941           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
    1942             :                                                "Invalid monotonic timestamp %" PRIu64 ", refusing entry.",
    1943             :                                                ts->monotonic);
    1944             :         } else {
    1945           0 :                 dual_timestamp_get(&_ts);
    1946           0 :                 ts = &_ts;
    1947             :         }
    1948             : 
    1949             : #if HAVE_GCRYPT
    1950        6285 :         r = journal_file_maybe_append_tag(f, ts->realtime);
    1951        6285 :         if (r < 0)
    1952           0 :                 return r;
    1953             : #endif
    1954             : 
    1955             :         /* alloca() can't take 0, hence let's allocate at least one */
    1956        6285 :         items = newa(EntryItem, MAX(1u, n_iovec));
    1957             :         /* Store every field as a data object and remember its offset and hash for the entry's item list */
    1958       12830 :         for (i = 0; i < n_iovec; i++) {
    1959             :                 uint64_t p;
    1960             :                 Object *o;
    1961             : 
    1962        6545 :                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
    1963        6545 :                 if (r < 0)
    1964           0 :                         return r;
    1965             : 
    1966        6545 :                 xor_hash ^= le64toh(o->data.hash);
    1967        6545 :                 items[i].object_offset = htole64(p);
    1968        6545 :                 items[i].hash = o->data.hash;
    1969             :         }
    1970             : 
    1971             :         /* Order by the position on disk, in order to improve seek
    1972             :          * times for rotating media. */
    1973        6285 :         typesafe_qsort(items, n_iovec, entry_item_cmp);
    1974             : 
    1975        6285 :         r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
    1976             : 
    1977             :         /* If the memory mapping triggered a SIGBUS then we return an
    1978             :          * IO error and ignore the error code passed down to us, since
    1979             :          * it is very likely just an effect of a nullified replacement
    1980             :          * mapping page */
    1981             : 
    1982        6285 :         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
    1983           0 :                 r = -EIO;
    1984             :         /* The change is announced even if r is negative at this point */
    1985        6285 :         if (f->post_change_timer)
    1986           0 :                 schedule_post_change(f);
    1987             :         else
    1988        6285 :                 journal_file_post_change(f);
    1989             : 
    1990        6285 :         return r;
    1991             : }
    1992             : /* Remembers, per entry array chain, the array that was looked at last, so that later lookups can resume there instead of walking the chain from its first array. Items are kept in an OrderedHashmap keyed by the 'first' field. */
    1993             : typedef struct ChainCacheItem {
    1994             :         uint64_t first; /* the array at the beginning of the chain */
    1995             :         uint64_t array; /* the cached array */
    1996             :         uint64_t begin; /* the first item in the cached array */
    1997             :         uint64_t total; /* the total number of items in all arrays before this one in the chain */
    1998             :         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
    1999             : } ChainCacheItem;
    2000             : /* Stores (array, begin, total, last_index) as the cached position for the chain starting at 'first'.
                     :  *
                     :  * ci is the existing cache item for this chain, or NULL if there is none. In the latter case
                     :  * a new item is inserted into h, unless the array to cache is the chain's first one. When h
                     :  * already holds CHAIN_CACHE_MAX items, its first item is stolen and reused instead of
                     :  * allocating a new one. Allocation and insertion failures are silently ignored; the
                     :  * position is then simply not cached. */
    2001       21653 : static void chain_cache_put(
    2002             :                 OrderedHashmap *h,
    2003             :                 ChainCacheItem *ci,
    2004             :                 uint64_t first,
    2005             :                 uint64_t array,
    2006             :                 uint64_t begin,
    2007             :                 uint64_t total,
    2008             :                 uint64_t last_index) {
    2009             : 
    2010       21653 :         if (!ci) {
    2011             :                 /* If the chain item to cache for this chain is the
    2012             :                  * first one it's not worth caching anything */
    2013         354 :                 if (array == first)
    2014         243 :                         return;
    2015             : 
    2016         111 :                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
    2017           0 :                         ci = ordered_hashmap_steal_first(h);
    2018           0 :                         assert(ci);
    2019             :                 } else {
    2020         111 :                         ci = new(ChainCacheItem, 1);
    2021         111 :                         if (!ci)
    2022           0 :                                 return;
    2023             :                 }
    2024             :                 /* The key pointer handed to the hashmap points into the item itself, hence set it before inserting */
    2025         111 :                 ci->first = first;
    2026             : 
    2027         111 :                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
    2028           0 :                         free(ci);
    2029           0 :                         return;
    2030             :                 }
    2031             :         } else
    2032       21299 :                 assert(ci->first == first);
    2033             : 
    2034       21410 :         ci->array = array;
    2035       21410 :         ci->begin = begin;
    2036       21410 :         ci->total = total;
    2037       21410 :         ci->last_index = last_index;
    2038             : }
    2039             : /* Looks up item number i in the entry array chain starting at offset 'first'.
                     :  *
                     :  * If the chain cache has an item for this chain and i is greater than the number of items
                     :  * preceding the cached array, the walk starts at the cached array; otherwise it starts at
                     :  * 'first'. The array in which the item is found is stored in the chain cache before the
                     :  * referenced entry object itself is accessed.
                     :  *
                     :  * Returns 1 and fills in *ret and *offset (each only if non-NULL) when the entry was found,
                     :  * 0 when the chain ends before index i is reached, and a negative error if an object
                     :  * cannot be accessed. */
    2040       10947 : static int generic_array_get(
    2041             :                 JournalFile *f,
    2042             :                 uint64_t first,
    2043             :                 uint64_t i,
    2044             :                 Object **ret, uint64_t *offset) {
    2045             : 
    2046             :         Object *o;
    2047       10947 :         uint64_t p = 0, a, t = 0;
    2048             :         int r;
    2049             :         ChainCacheItem *ci;
    2050             : 
    2051       10947 :         assert(f);
    2052             : 
    2053       10947 :         a = first;
    2054             : 
    2055             :         /* Try the chain cache first */
    2056       10947 :         ci = ordered_hashmap_get(f->chain_cache, &first);
    2057       10947 :         if (ci && i > ci->total) {
    2058       10674 :                 a = ci->array;
    2059       10674 :                 i -= ci->total;
    2060       10674 :                 t = ci->total;
    2061             :         }
    2062             :         /* Walk the chain: i is the index relative to the current array, t counts the items of the arrays already skipped */
    2063       11811 :         while (a > 0) {
    2064             :                 uint64_t k;
    2065             : 
    2066       11811 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
    2067       11811 :                 if (r < 0)
    2068           0 :                         return r;
    2069             : 
    2070       11811 :                 k = journal_file_entry_array_n_items(o);
    2071       11811 :                 if (i < k) {
    2072       10947 :                         p = le64toh(o->entry_array.items[i]);
    2073       10947 :                         goto found;
    2074             :                 }
    2075             : 
    2076         864 :                 i -= k;
    2077         864 :                 t += k;
    2078         864 :                 a = le64toh(o->entry_array.next_entry_array_offset);
    2079             :         }
    2080             :         /* End of chain reached without finding index i */
    2081           0 :         return 0;
    2082             : 
    2083       10947 : found:
    2084             :         /* Let's cache this item for the next invocation */
    2085       10947 :         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
    2086             : 
    2087       10947 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
    2088       10947 :         if (r < 0)
    2089         438 :                 return r;
    2090             : 
    2091       10509 :         if (ret)
    2092       10404 :                 *ret = o;
    2093             : 
    2094       10509 :         if (offset)
    2095       10509 :                 *offset = p;
    2096             : 
    2097       10509 :         return 1;
    2098             : }
    2099             : /* Like generic_array_get(), but for lists whose first item is stored separately: index 0 refers to the entry at offset 'extra', every other index i is looked up as item i-1 of the chain starting at 'first'. Returns 1 if found (filling in *ret and *offset if non-NULL), otherwise the result of the underlying lookup or a negative error. */
    2100         559 : static int generic_array_get_plus_one(
    2101             :                 JournalFile *f,
    2102             :                 uint64_t extra,
    2103             :                 uint64_t first,
    2104             :                 uint64_t i,
    2105             :                 Object **ret, uint64_t *offset) {
    2106             : 
    2107             :         Object *o;
    2108             : 
    2109         559 :         assert(f);
    2110             : 
    2111         559 :         if (i == 0) {
    2112             :                 int r;
    2113             : 
    2114          15 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
    2115          15 :                 if (r < 0)
    2116           0 :                         return r;
    2117             : 
    2118          15 :                 if (ret)
    2119           3 :                         *ret = o;
    2120             : 
    2121          15 :                 if (offset)
    2122          15 :                         *offset = extra;
    2123             : 
    2124          15 :                 return 1;
    2125             :         }
    2126             : 
    2127         544 :         return generic_array_get(f, first, i-1, ret, offset);
    2128             : }
    2129             : /* Results of the test_object() callbacks passed to generic_array_bisect(). NOTE(review): judging by the bisection code, TEST_LEFT means the tested object lies left of the needle (the needle is further right) and TEST_RIGHT the opposite; confirm against the callback implementations. */
    2130             : enum {
    2131             :         TEST_FOUND,
    2132             :         TEST_LEFT,
    2133             :         TEST_RIGHT
    2134             : };
    2135             : 
    2136       10720 : static int generic_array_bisect(
    2137             :                 JournalFile *f,
    2138             :                 uint64_t first,
    2139             :                 uint64_t n,
    2140             :                 uint64_t needle,
    2141             :                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
    2142             :                 direction_t direction,
    2143             :                 Object **ret,
    2144             :                 uint64_t *offset,
    2145             :                 uint64_t *idx) {
    2146             : 
    2147       10720 :         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
    2148       10720 :         bool subtract_one = false;
    2149       10720 :         Object *o, *array = NULL;
    2150             :         int r;
    2151             :         ChainCacheItem *ci;
    2152             : 
    2153       10720 :         assert(f);
    2154       10720 :         assert(test_object);
    2155             : 
    2156             :         /* Start with the first array in the chain */
    2157       10720 :         a = first;
    2158             : 
    2159       10720 :         ci = ordered_hashmap_get(f->chain_cache, &first);
    2160       10720 :         if (ci && n > ci->total && ci->begin != 0) {
    2161             :                 /* Ah, we have iterated this bisection array chain
    2162             :                  * previously! Let's see if we can skip ahead in the
    2163             :                  * chain, as far as the last time. But we can't jump
    2164             :                  * backwards in the chain, so let's check that
    2165             :                  * first. */
    2166             : 
    2167       10634 :                 r = test_object(f, ci->begin, needle);
    2168       10634 :                 if (r < 0)
    2169           0 :                         return r;
    2170             : 
    2171       10634 :                 if (r == TEST_LEFT) {
    2172             :                         /* OK, what we are looking for is right of the
    2173             :                          * begin of this EntryArray, so let's jump
    2174             :                          * straight to previously cached array in the
    2175             :                          * chain */
    2176             : 
    2177       10594 :                         a = ci->array;
    2178       10594 :                         n -= ci->total;
    2179       10594 :                         t = ci->total;
    2180       10594 :                         last_index = ci->last_index;
    2181             :                 }
    2182             :         }
    2183             : 
    2184       10811 :         while (a > 0) {
    2185             :                 uint64_t left, right, k, lp;
    2186             : 
    2187       10807 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
    2188       10807 :                 if (r < 0)
    2189           0 :                         return r;
    2190             : 
    2191       10807 :                 k = journal_file_entry_array_n_items(array);
    2192       10807 :                 right = MIN(k, n);
    2193       10807 :                 if (right <= 0)
    2194           0 :                         return 0;
    2195             : 
    2196       10807 :                 i = right - 1;
    2197       10807 :                 lp = p = le64toh(array->entry_array.items[i]);
    2198       10807 :                 if (p <= 0)
    2199           6 :                         r = -EBADMSG;
    2200             :                 else
    2201       10801 :                         r = test_object(f, p, needle);
    2202       10807 :                 if (r == -EBADMSG) {
    2203           6 :                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
    2204           6 :                         n = i;
    2205           6 :                         continue;
    2206             :                 }
    2207       10801 :                 if (r < 0)
    2208           0 :                         return r;
    2209             : 
    2210       10801 :                 if (r == TEST_FOUND)
    2211         153 :                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
    2212             : 
    2213       10801 :                 if (r == TEST_RIGHT) {
    2214       10602 :                         left = 0;
    2215       10602 :                         right -= 1;
    2216             : 
    2217       10602 :                         if (last_index != (uint64_t) -1) {
    2218       10475 :                                 assert(last_index <= right);
    2219             : 
    2220             :                                 /* If we cached the last index we
    2221             :                                  * looked at, let's try not to jump
    2222             :                                  * too wildly around and see if we can
    2223             :                                  * limit the range to look at early to
    2224             :                                  * the immediate neighbors of the last
    2225             :                                  * index we looked at. */
    2226             : 
    2227       10475 :                                 if (last_index > 0) {
    2228       10459 :                                         uint64_t x = last_index - 1;
    2229             : 
    2230       10459 :                                         p = le64toh(array->entry_array.items[x]);
    2231       10459 :                                         if (p <= 0)
    2232           0 :                                                 return -EBADMSG;
    2233             : 
    2234       10459 :                                         r = test_object(f, p, needle);
    2235       10459 :                                         if (r < 0)
    2236           0 :                                                 return r;
    2237             : 
    2238       10459 :                                         if (r == TEST_FOUND)
    2239          10 :                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
    2240             : 
    2241       10459 :                                         if (r == TEST_RIGHT)
    2242          59 :                                                 right = x;
    2243             :                                         else
    2244       10400 :                                                 left = x + 1;
    2245             :                                 }
    2246             : 
    2247       10475 :                                 if (last_index < right) {
    2248       10384 :                                         uint64_t y = last_index + 1;
    2249             : 
    2250       10384 :                                         p = le64toh(array->entry_array.items[y]);
    2251       10384 :                                         if (p <= 0)
    2252           0 :                                                 return -EBADMSG;
    2253             : 
    2254       10384 :                                         r = test_object(f, p, needle);
    2255       10384 :                                         if (r < 0)
    2256           0 :                                                 return r;
    2257             : 
    2258       10384 :                                         if (r == TEST_FOUND)
    2259           1 :                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
    2260             : 
    2261       10384 :                                         if (r == TEST_RIGHT)
    2262       10384 :                                                 right = y;
    2263             :                                         else
    2264           0 :                                                 left = y + 1;
    2265             :                                 }
    2266             :                         }
    2267             : 
    2268             :                         for (;;) {
    2269       22092 :                                 if (left == right) {
    2270       10602 :                                         if (direction == DIRECTION_UP)
    2271         196 :                                                 subtract_one = true;
    2272             : 
    2273       10602 :                                         i = left;
    2274       10602 :                                         goto found;
    2275             :                                 }
    2276             : 
    2277       11490 :                                 assert(left < right);
    2278       11490 :                                 i = (left + right) / 2;
    2279             : 
    2280       11490 :                                 p = le64toh(array->entry_array.items[i]);
    2281       11490 :                                 if (p <= 0)
    2282           0 :                                         r = -EBADMSG;
    2283             :                                 else
    2284       11490 :                                         r = test_object(f, p, needle);
    2285       11490 :                                 if (r == -EBADMSG) {
    2286           0 :                                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
    2287           0 :                                         right = n = i;
    2288           0 :                                         continue;
    2289             :                                 }
    2290       11490 :                                 if (r < 0)
    2291           0 :                                         return r;
    2292             : 
    2293       11490 :                                 if (r == TEST_FOUND)
    2294       10367 :                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
    2295             : 
    2296       11490 :                                 if (r == TEST_RIGHT)
    2297       10620 :                                         right = i;
    2298             :                                 else
    2299         870 :                                         left = i + 1;
    2300             :                         }
    2301             :                 }
    2302             : 
    2303         199 :                 if (k >= n) {
    2304         114 :                         if (direction == DIRECTION_UP) {
    2305         107 :                                 i = n;
    2306         107 :                                 subtract_one = true;
    2307         107 :                                 goto found;
    2308             :                         }
    2309             : 
    2310           7 :                         return 0;
    2311             :                 }
    2312             : 
    2313          85 :                 last_p = lp;
    2314             : 
    2315          85 :                 n -= k;
    2316          85 :                 t += k;
    2317          85 :                 last_index = (uint64_t) -1;
    2318          85 :                 a = le64toh(array->entry_array.next_entry_array_offset);
    2319             :         }
    2320             : 
    2321           4 :         return 0;
    2322             : 
    2323       10709 : found:
    2324       10709 :         if (subtract_one && t == 0 && i == 0)
    2325           3 :                 return 0;
    2326             : 
    2327             :         /* Let's cache this item for the next invocation */
    2328       10706 :         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
    2329             : 
    2330       10706 :         if (subtract_one && i == 0)
    2331           5 :                 p = last_p;
    2332       10701 :         else if (subtract_one)
    2333         295 :                 p = le64toh(array->entry_array.items[i-1]);
    2334             :         else
    2335       10406 :                 p = le64toh(array->entry_array.items[i]);
    2336             : 
    2337       10706 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
    2338       10706 :         if (r < 0)
    2339           0 :                 return r;
    2340             : 
    2341       10706 :         if (ret)
    2342          11 :                 *ret = o;
    2343             : 
    2344       10706 :         if (offset)
    2345         397 :                 *offset = p;
    2346             : 
    2347       10706 :         if (idx)
    2348       10306 :                 *idx = t + i + (subtract_one ? -1 : 0);
    2349             : 
    2350       10706 :         return 1;
    2351             : }
    2352             : /* Like generic_array_bisect(), but treats the entry at offset 'extra' as item 0 in front of the array chain 'first'. */
    2353         424 : static int generic_array_bisect_plus_one(
    2354             :                 JournalFile *f,
    2355             :                 uint64_t extra,
    2356             :                 uint64_t first,
    2357             :                 uint64_t n,
    2358             :                 uint64_t needle,
    2359             :                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
    2360             :                 direction_t direction,
    2361             :                 Object **ret,
    2362             :                 uint64_t *offset,
    2363             :                 uint64_t *idx) {
    2364             :         /* Returns 1 if an entry was found (filling in ret/offset/idx where non-NULL), 0 if not, negative on failure. */
    2365             :         int r;
    2366         424 :         bool step_back = false;
    2367             :         Object *o;
    2368             : 
    2369         424 :         assert(f);
    2370         424 :         assert(test_object);
    2371             : 
    2372         424 :         if (n <= 0)
    2373           0 :                 return 0;
    2374             : 
    2375             :         /* This bisects the array in object 'first', but first checks
    2376             :          * the additional entry at offset 'extra' that precedes it. */
    2377         424 :         r = test_object(f, extra, needle);
    2378         424 :         if (r < 0)
    2379           0 :                 return r;
    2380             : 
    2381         424 :         if (r == TEST_FOUND)
    2382          16 :                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
    2383             : 
    2384             :         /* If we are looking with DIRECTION_UP then we need to first
    2385             :            see if in the actual array there is a matching entry, and
    2386             :            return the last one of that. But if there isn't any we need
    2387             :            to return the extra entry. Hence remember this, and return
    2388             :            it below. */
    2389         424 :         if (r == TEST_LEFT)
    2390         402 :                 step_back = direction == DIRECTION_UP;
    2391             : 
    2392         424 :         if (r == TEST_RIGHT) {
    2393          22 :                 if (direction == DIRECTION_DOWN)
    2394          19 :                         goto found;
    2395             :                 else
    2396           3 :                         return 0;
    2397             :         }
    2398             :         /* Search the array chain with n-1 items, i.e. 'n' less the extra entry tested above. */
    2399         402 :         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
    2400             :         /* Nothing suitable in the array: fall back to the extra entry if it qualified for DIRECTION_UP. */
    2401         402 :         if (r == 0 && step_back)
    2402           3 :                 goto found;
    2403             :         /* Indexes reported for the array are shifted by one, since index 0 denotes the extra entry. */
    2404         399 :         if (r > 0 && idx)
    2405           0 :                 (*idx)++;
    2406             : 
    2407         399 :         return r;
    2408             : 
    2409          22 : found:
    2410          22 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
    2411          22 :         if (r < 0)
    2412           0 :                 return r;
    2413             : 
    2414          22 :         if (ret)
    2415           0 :                 *ret = o;
    2416             : 
    2417          22 :         if (offset)
    2418          22 :                 *offset = extra;
    2419             : 
    2420          22 :         if (idx)
    2421           0 :                 *idx = 0;
    2422             : 
    2423          22 :         return 1;
    2424             : }
    2425             : /* Bisection callback: compares the entry offset 'p' itself against 'needle'; returns TEST_FOUND, TEST_LEFT or TEST_RIGHT. */
    2426       54171 : _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
    2427       54171 :         assert(f);
    2428       54171 :         assert(p > 0);
    2429             :         /* TEST_LEFT: 'p' is smaller than the needle; TEST_RIGHT: 'p' is larger. */
    2430       54171 :         if (p == needle)
    2431       10561 :                 return TEST_FOUND;
    2432       43610 :         else if (p < needle)
    2433       22245 :                 return TEST_LEFT;
    2434             :         else
    2435       21365 :                 return TEST_RIGHT;
    2436             : }
    2437             : /* Bisection callback: compares the seqnum of the entry object at offset 'p' against 'needle'. */
    2438          15 : static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
    2439             :         Object *o;
    2440             :         int r;
    2441             : 
    2442          15 :         assert(f);
    2443          15 :         assert(p > 0);
    2444             :         /* Access the entry object first; a failure here is passed back to the caller as a negative value. */
    2445          15 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
    2446          15 :         if (r < 0)
    2447           0 :                 return r;
    2448             :         /* The seqnum is converted with le64toh() before it is compared. */
    2449          15 :         if (le64toh(o->entry.seqnum) == needle)
    2450           7 :                 return TEST_FOUND;
    2451           8 :         else if (le64toh(o->entry.seqnum) < needle)
    2452           3 :                 return TEST_LEFT;
    2453             :         else
    2454           5 :                 return TEST_RIGHT;
    2455             : }
    2456             : /* Bisects the file-wide entry array by seqnum in 'direction'; returns whatever generic_array_bisect() returns. */
    2457           8 : int journal_file_move_to_entry_by_seqnum(
    2458             :                 JournalFile *f,
    2459             :                 uint64_t seqnum,
    2460             :                 direction_t direction,
    2461             :                 Object **ret,
    2462             :                 uint64_t *offset) {
    2463           8 :         assert(f);
    2464           8 :         assert(f->header);
    2465             :         /* Array start and item count come from the file header; the index of the hit is not requested (NULL). */
    2466           8 :         return generic_array_bisect(f,
    2467           8 :                                     le64toh(f->header->entry_array_offset),
    2468           8 :                                     le64toh(f->header->n_entries),
    2469             :                                     seqnum,
    2470             :                                     test_object_seqnum,
    2471             :                                     direction,
    2472             :                                     ret, offset, NULL);
    2473             : }
    2474             : /* Bisection callback: compares the realtime timestamp of the entry object at offset 'p' against 'needle'. */
    2475           6 : static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
    2476             :         Object *o;
    2477             :         int r;
    2478             : 
    2479           6 :         assert(f);
    2480           6 :         assert(p > 0);
    2481             :         /* Access the entry object first; a failure here is passed back to the caller as a negative value. */
    2482           6 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
    2483           6 :         if (r < 0)
    2484           0 :                 return r;
    2485             :         /* The timestamp is converted with le64toh() before it is compared. */
    2486           6 :         if (le64toh(o->entry.realtime) == needle)
    2487           0 :                 return TEST_FOUND;
    2488           6 :         else if (le64toh(o->entry.realtime) < needle)
    2489           2 :                 return TEST_LEFT;
    2490             :         else
    2491           4 :                 return TEST_RIGHT;
    2492             : }
    2493             : /* Bisects the file-wide entry array by realtime timestamp in 'direction'; returns whatever generic_array_bisect() returns. */
    2494           4 : int journal_file_move_to_entry_by_realtime(
    2495             :                 JournalFile *f,
    2496             :                 uint64_t realtime,
    2497             :                 direction_t direction,
    2498             :                 Object **ret,
    2499             :                 uint64_t *offset) {
    2500           4 :         assert(f);
    2501           4 :         assert(f->header);
    2502             :         /* Array start and item count come from the file header; the index of the hit is not requested (NULL). */
    2503           4 :         return generic_array_bisect(f,
    2504           4 :                                     le64toh(f->header->entry_array_offset),
    2505           4 :                                     le64toh(f->header->n_entries),
    2506             :                                     realtime,
    2507             :                                     test_object_realtime,
    2508             :                                     direction,
    2509             :                                     ret, offset, NULL);
    2510             : }
    2511             : /* Bisection callback: compares the monotonic timestamp of the entry object at offset 'p' against 'needle'. */
    2512           0 : static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
    2513             :         Object *o;
    2514             :         int r;
    2515             : 
    2516           0 :         assert(f);
    2517           0 :         assert(p > 0);
    2518             :         /* Access the entry object first; a failure here is passed back to the caller as a negative value. */
    2519           0 :         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
    2520           0 :         if (r < 0)
    2521           0 :                 return r;
    2522             :         /* The timestamp is converted with le64toh() before it is compared; the boot ID is not looked at here. */
    2523           0 :         if (le64toh(o->entry.monotonic) == needle)
    2524           0 :                 return TEST_FOUND;
    2525           0 :         else if (le64toh(o->entry.monotonic) < needle)
    2526           0 :                 return TEST_LEFT;
    2527             :         else
    2528           0 :                 return TEST_RIGHT;
    2529             : }
    2530             : /* Looks up the data object for the field "_BOOT_ID=<boot_id>" through journal_file_find_data_object(). */
    2531           4 : static int find_data_object_by_boot_id(
    2532             :                 JournalFile *f,
    2533             :                 sd_id128_t boot_id,
    2534             :                 Object **o,
    2535             :                 uint64_t *b) {
    2536             :         /* Buffer: the prefix, 32 characters for the formatted ID, and one byte for the terminating NUL. */
    2537           4 :         char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
    2538             :         /* 9 is the length of "_BOOT_ID="; sizeof(t) - 1 below leaves the terminating NUL out of the lookup size. */
    2539           4 :         sd_id128_to_string(boot_id, t + 9);
    2540           4 :         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
    2541             : }
    2542             : /* Seeks by monotonic timestamp among the entries linked from the _BOOT_ID= data object of 'boot_id'. */
    2543           4 : int journal_file_move_to_entry_by_monotonic(
    2544             :                 JournalFile *f,
    2545             :                 sd_id128_t boot_id,
    2546             :                 uint64_t monotonic,
    2547             :                 direction_t direction,
    2548             :                 Object **ret,
    2549             :                 uint64_t *offset) {
    2550             :         /* Returns -ENOENT if the file has no data object for this boot ID, otherwise the result of the bisection. */
    2551             :         Object *o;
    2552             :         int r;
    2553             : 
    2554           4 :         assert(f);
    2555             : 
    2556           4 :         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
    2557           4 :         if (r < 0)
    2558           0 :                 return r;
    2559           4 :         if (r == 0)
    2560           4 :                 return -ENOENT;
    2561             :         /* Bisect the data object's own entry list: its inline entry_offset plus its entry array chain. */
    2562           0 :         return generic_array_bisect_plus_one(f,
    2563           0 :                                              le64toh(o->data.entry_offset),
    2564           0 :                                              le64toh(o->data.entry_array_offset),
    2565           0 :                                              le64toh(o->data.n_entries),
    2566             :                                              monotonic,
    2567             :                                              test_object_monotonic,
    2568             :                                              direction,
    2569             :                                              ret, offset, NULL);
    2570             : }
    2571             : /* Resets the cached location of 'f': the type becomes LOCATION_HEAD and all current_* fields are cleared to zero. */
    2572       11974 : void journal_file_reset_location(JournalFile *f) {
    2573       11974 :         f->location_type = LOCATION_HEAD;
    2574       11974 :         f->current_offset = 0;
    2575       11974 :         f->current_seqnum = 0;
    2576       11974 :         f->current_realtime = 0;
    2577       11974 :         f->current_monotonic = 0;
    2578       11974 :         zero(f->current_boot_id);
    2579       11974 :         f->current_xor_hash = 0;
    2580       11974 : }
    2581             : /* Records entry object 'o' at 'offset' as the cached location of 'f' (LOCATION_SEEK); 64-bit fields go through le64toh(), the boot ID is copied as is. */
    2582       10611 : void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
    2583       10611 :         f->location_type = LOCATION_SEEK;
    2584       10611 :         f->current_offset = offset;
    2585       10611 :         f->current_seqnum = le64toh(o->entry.seqnum);
    2586       10611 :         f->current_realtime = le64toh(o->entry.realtime);
    2587       10611 :         f->current_monotonic = le64toh(o->entry.monotonic);
    2588       10611 :         f->current_boot_id = o->entry.boot_id;
    2589       10611 :         f->current_xor_hash = le64toh(o->entry.xor_hash);
    2590       10611 : }
    2591             : /* Orders the cached locations of 'af' and 'bf'; returns 0 if they are treated as equal, otherwise the CMP() result of the first criterion that differs. */
    2592      991152 : int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
    2593             :         int r;
    2594             :         /* Both files must have a header and a location stored with type LOCATION_SEEK. */
    2595      991152 :         assert(af);
    2596      991152 :         assert(af->header);
    2597      991152 :         assert(bf);
    2598      991152 :         assert(bf->header);
    2599      991152 :         assert(af->location_type == LOCATION_SEEK);
    2600      991152 :         assert(bf->location_type == LOCATION_SEEK);
    2601             : 
    2602             :         /* If contents and timestamps match, these entries are
    2603             :          * identical, even if the seqnum does not match */
    2604      991152 :         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
    2605       20682 :             af->current_monotonic == bf->current_monotonic &&
    2606          82 :             af->current_realtime == bf->current_realtime &&
    2607          82 :             af->current_xor_hash == bf->current_xor_hash)
    2608          82 :                 return 0;
    2609             : 
    2610      991070 :         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
    2611             : 
    2612             :                 /* If this is from the same seqnum source, compare
    2613             :                  * seqnums */
    2614        9697 :                 r = CMP(af->current_seqnum, bf->current_seqnum);
    2615        9697 :                 if (r != 0)
    2616        9697 :                         return r;
    2617             : 
    2618             :                 /* Wow! This is weird, different data but the same
    2619             :                  * seqnums? Something is borked, but let's make the
    2620             :                  * best of it and compare by time. */
    2621             :         }
    2622             : 
    2623      981373 :         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
    2624             : 
    2625             :                 /* If the boot id matches, compare monotonic time */
    2626       20570 :                 r = CMP(af->current_monotonic, bf->current_monotonic);
    2627       20570 :                 if (r != 0)
    2628       20570 :                         return r;
    2629             :         }
    2630             : 
    2631             :         /* Otherwise, compare UTC time */
    2632      960803 :         r = CMP(af->current_realtime, bf->current_realtime);
    2633      960803 :         if (r != 0)
    2634      960803 :                 return r;
    2635             : 
    2636             :         /* Finally, compare by contents */
    2637           0 :         return CMP(af->current_xor_hash, bf->current_xor_hash);
    2638             : }
    2639             : /* Moves *i one step in 'direction' within an array of 'n' items; returns 1 if it moved, 0 if it was already at the boundary. */
    2640       10744 : static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
    2641             :         /* Note: 'n - 1' below presumes n > 0; the callers in this part of the file return early when n is 0. */
    2642             :         /* Increase or decrease the specified index, in the right direction. */
    2643             : 
    2644       10744 :         if (direction == DIRECTION_DOWN) {
    2645       10284 :                 if (*i >= n - 1)
    2646          14 :                         return 0;
    2647             : 
    2648       10270 :                 (*i) ++;
    2649             :         } else {
    2650         460 :                 if (*i <= 0)
    2651          10 :                         return 0;
    2652             : 
    2653         450 :                 (*i) --;
    2654             :         }
    2655             : 
    2656       10720 :         return 1;
    2657             : }
    2658             : /* Returns true if both offsets are non-zero and 'new_offset' lies strictly beyond 'old_offset' in 'direction'. */
    2659       10282 : static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
    2660             : 
    2661             :         /* Consider it an error if any of the two offsets is uninitialized */
    2662       10282 :         if (old_offset == 0 || new_offset == 0)
    2663           0 :                 return false;
    2664             : 
    2665             :         /* If we go down, the new offset must be larger than the old one; if we go up, it must be smaller. */
    2666             :         return direction == DIRECTION_DOWN ?
    2667       10282 :                 new_offset > old_offset  :
    2668             :                 new_offset < old_offset;
    2669             : }
    2670             : /* Steps from the entry at offset 'p' to its neighbour in 'direction' within the file-wide entry array; p == 0 starts at the first (down) or last (up) entry. */
    2671       10428 : int journal_file_next_entry(
    2672             :                 JournalFile *f,
    2673             :                 uint64_t p,
    2674             :                 direction_t direction,
    2675             :                 Object **ret, uint64_t *offset) {
    2676             :         /* Returns 1 on success (storing the offset in *offset if non-NULL), 0 if there is no further entry, negative on failure. */
    2677             :         uint64_t i, n, ofs;
    2678             :         int r;
    2679             : 
    2680       10428 :         assert(f);
    2681       10428 :         assert(f->header);
    2682             : 
    2683       10428 :         n = le64toh(f->header->n_entries);
    2684       10428 :         if (n <= 0)
    2685           1 :                 return 0;
    2686             : 
    2687       10427 :         if (p == 0)
    2688         121 :                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
    2689             :         else {
    2690       10306 :                 r = generic_array_bisect(f,
    2691       10306 :                                          le64toh(f->header->entry_array_offset),
    2692       10306 :                                          le64toh(f->header->n_entries),
    2693             :                                          p,
    2694             :                                          test_object_offset,
    2695             :                                          DIRECTION_DOWN,
    2696             :                                          NULL, NULL,
    2697             :                                          &i);
    2698       10306 :                 if (r <= 0)
    2699           0 :                         return r;
    2700             :                 /* 'i' now holds the index located for 'p'; move it one step in the requested direction. */
    2701       10306 :                 r = bump_array_index(&i, direction, n);
    2702       10306 :                 if (r <= 0)
    2703          24 :                         return r;
    2704             :         }
    2705             : 
    2706             :         /* And jump to it */
    2707             :         for (;;) {
    2708       10403 :                 r = generic_array_get(f,
    2709       10403 :                                       le64toh(f->header->entry_array_offset),
    2710             :                                       i,
    2711             :                                       ret, &ofs);
    2712       10403 :                 if (r > 0)
    2713       10403 :                         break;
    2714           0 :                 if (r != -EBADMSG)
    2715           0 :                         return r;
    2716             : 
    2717             :                 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
    2718             :                  * the next one might work for us instead. */
    2719           0 :                 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
    2720             : 
    2721           0 :                 r = bump_array_index(&i, direction, n);
    2722           0 :                 if (r <= 0)
    2723           0 :                         return r;
    2724             :         }
    2725             : 
    2726             :         /* Ensure our array is properly ordered. */
    2727       10403 :         if (p > 0 && !check_properly_ordered(ofs, p, direction))
    2728           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
    2729             :                                        "%s: entry array not properly ordered at entry %" PRIu64,
    2730             :                                        f->path, i);
    2731             : 
    2732       10403 :         if (offset)
    2733       10403 :                 *offset = ofs;
    2734             : 
    2735       10403 :         return 1;
    2736             : }
    2737             : /* Like journal_file_next_entry(), but walks only the entries linked from the data object at 'data_offset'; without 'o' it starts at the first (down) or last (up) one. */
    2738         121 : int journal_file_next_entry_for_data(
    2739             :                 JournalFile *f,
    2740             :                 Object *o, uint64_t p,
    2741             :                 uint64_t data_offset,
    2742             :                 direction_t direction,
    2743             :                 Object **ret, uint64_t *offset) {
    2744             :         /* Returns 1 on success (storing the offset in *offset if non-NULL), 0 if there is no further entry, negative on failure. */
    2745             :         uint64_t i, n, ofs;
    2746             :         Object *d;
    2747             :         int r;
    2748             : 
    2749         121 :         assert(f);
    2750         121 :         assert(p > 0 || !o);
    2751             : 
    2752         121 :         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
    2753         121 :         if (r < 0)
    2754           0 :                 return r;
    2755             : 
    2756         121 :         n = le64toh(d->data.n_entries);
    2757         121 :         if (n <= 0)
    2758           0 :                 return n;
    2759             : 
    2760         121 :         if (!o)
    2761         121 :                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
    2762             :         else {
    2763           0 :                 if (o->object.type != OBJECT_ENTRY)
    2764           0 :                         return -EINVAL;
    2765             :                 /* Locate the index of the current entry 'p' in the data object's entry list. */
    2766           0 :                 r = generic_array_bisect_plus_one(f,
    2767           0 :                                                   le64toh(d->data.entry_offset),
    2768           0 :                                                   le64toh(d->data.entry_array_offset),
    2769           0 :                                                   le64toh(d->data.n_entries),
    2770             :                                                   p,
    2771             :                                                   test_object_offset,
    2772             :                                                   DIRECTION_DOWN,
    2773             :                                                   NULL, NULL,
    2774             :                                                   &i);
    2775             : 
    2776           0 :                 if (r <= 0)
    2777           0 :                         return r;
    2778             : 
    2779           0 :                 r = bump_array_index(&i, direction, n);
    2780           0 :                 if (r <= 0)
    2781           0 :                         return r;
    2782             :         }
    2783             :         /* Fetch item 'i'; items that yield -EBADMSG are skipped by moving on in the same direction. */
    2784             :         for (;;) {
    2785         997 :                 r = generic_array_get_plus_one(f,
    2786         559 :                                                le64toh(d->data.entry_offset),
    2787         559 :                                                le64toh(d->data.entry_array_offset),
    2788             :                                                i,
    2789             :                                                ret, &ofs);
    2790         559 :                 if (r > 0)
    2791         121 :                         break;
    2792         438 :                 if (r != -EBADMSG)
    2793           0 :                         return r;
    2794             : 
    2795         438 :                 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
    2796             : 
    2797         438 :                 r = bump_array_index(&i, direction, n);
    2798         438 :                 if (r <= 0)
    2799           0 :                         return r;
    2800             :         }
    2801             : 
    2802             :         /* Ensure our array is properly ordered: fail only if the new offset does NOT lie beyond 'p' in our direction. */
    2803         121 :         if (p > 0 && !check_properly_ordered(ofs, p, direction))
    2804           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
    2805             :                                        "%s data entry array not properly ordered at entry %" PRIu64,
    2806             :                                        f->path, i);
    2807             : 
    2808         121 :         if (offset)
    2809         117 :                 *offset = ofs;
    2810             : 
    2811         121 :         return 1;
    2812             : }
    2813             : /* Bisects the entries linked from the data object at 'data_offset' by entry offset 'p', in 'direction'. */
    2814         424 : int journal_file_move_to_entry_by_offset_for_data(
    2815             :                 JournalFile *f,
    2816             :                 uint64_t data_offset,
    2817             :                 uint64_t p,
    2818             :                 direction_t direction,
    2819             :                 Object **ret, uint64_t *offset) {
    2820             :         /* Returns a negative value if the data object cannot be accessed, otherwise the result of the bisection. */
    2821             :         int r;
    2822             :         Object *d;
    2823             : 
    2824         424 :         assert(f);
    2825             : 
    2826         424 :         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
    2827         424 :         if (r < 0)
    2828           0 :                 return r;
    2829             :         /* Search the data object's inline entry_offset plus its entry array chain; the index of the hit is not requested. */
    2830         424 :         return generic_array_bisect_plus_one(f,
    2831         424 :                                              le64toh(d->data.entry_offset),
    2832         424 :                                              le64toh(d->data.entry_array_offset),
    2833         424 :                                              le64toh(d->data.n_entries),
    2834             :                                              p,
    2835             :                                              test_object_offset,
    2836             :                                              direction,
    2837             :                                              ret, offset, NULL);
    2838             : }
    2839             : 
    2840           0 : int journal_file_move_to_entry_by_monotonic_for_data(
    2841             :                 JournalFile *f,
    2842             :                 uint64_t data_offset,
    2843             :                 sd_id128_t boot_id,
    2844             :                 uint64_t monotonic,
    2845             :                 direction_t direction,
    2846             :                 Object **ret, uint64_t *offset) {
    2847             : 
    2848             :         Object *o, *d;
    2849             :         int r;
    2850             :         uint64_t b, z;
    2851             : 
    2852           0 :         assert(f);
    2853             : 
    2854             :         /* First, seek by time */
    2855           0 :         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
    2856           0 :         if (r < 0)
    2857           0 :                 return r;
    2858           0 :         if (r == 0)
    2859           0 :                 return -ENOENT;
    2860             : 
    2861           0 :         r = generic_array_bisect_plus_one(f,
    2862           0 :                                           le64toh(o->data.entry_offset),
    2863           0 :                                           le64toh(o->data.entry_array_offset),
    2864           0 :                                           le64toh(o->data.n_entries),
    2865             :                                           monotonic,
    2866             :                                           test_object_monotonic,
    2867             :                                           direction,
    2868             :                                           NULL, &z, NULL);
    2869           0 :         if (r <= 0)
    2870           0 :                 return r;
    2871             : 
    2872             :         /* And now, continue seeking until we find an entry that
    2873             :          * exists in both bisection arrays */
    2874             : 
    2875           0 :         for (;;) {
    2876             :                 Object *qo;
    2877             :                 uint64_t p, q;
    2878             : 
    2879           0 :                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
    2880           0 :                 if (r < 0)
    2881           0 :                         return r;
    2882             : 
    2883           0 :                 r = generic_array_bisect_plus_one(f,
    2884           0 :                                                   le64toh(d->data.entry_offset),
    2885           0 :                                                   le64toh(d->data.entry_array_offset),
    2886           0 :                                                   le64toh(d->data.n_entries),
    2887             :                                                   z,
    2888             :                                                   test_object_offset,
    2889             :                                                   direction,
    2890             :                                                   NULL, &p, NULL);
    2891           0 :                 if (r <= 0)
    2892           0 :                         return r;
    2893             : 
    2894           0 :                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
    2895           0 :                 if (r < 0)
    2896           0 :                         return r;
    2897             : 
    2898           0 :                 r = generic_array_bisect_plus_one(f,
    2899           0 :                                                   le64toh(o->data.entry_offset),
    2900           0 :                                                   le64toh(o->data.entry_array_offset),
    2901           0 :                                                   le64toh(o->data.n_entries),
    2902             :                                                   p,
    2903             :                                                   test_object_offset,
    2904             :                                                   direction,
    2905             :                                                   &qo, &q, NULL);
    2906             : 
    2907           0 :                 if (r <= 0)
    2908           0 :                         return r;
    2909             : 
    2910           0 :                 if (p == q) {
    2911           0 :                         if (ret)
    2912           0 :                                 *ret = qo;
    2913           0 :                         if (offset)
    2914           0 :                                 *offset = q;
    2915             : 
    2916           0 :                         return 1;
    2917             :                 }
    2918             : 
    2919           0 :                 z = q;
    2920             :         }
    2921             : }
    2922             : 
    2923           0 : int journal_file_move_to_entry_by_seqnum_for_data(
    2924             :                 JournalFile *f,
    2925             :                 uint64_t data_offset,
    2926             :                 uint64_t seqnum,
    2927             :                 direction_t direction,
    2928             :                 Object **ret, uint64_t *offset) {
    2929             : 
    2930             :         Object *d;
    2931             :         int r;
    2932             : 
    2933           0 :         assert(f);
    2934             : 
    2935           0 :         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
    2936           0 :         if (r < 0)
    2937           0 :                 return r;
    2938             : 
    2939           0 :         return generic_array_bisect_plus_one(f,
    2940           0 :                                              le64toh(d->data.entry_offset),
    2941           0 :                                              le64toh(d->data.entry_array_offset),
    2942           0 :                                              le64toh(d->data.n_entries),
    2943             :                                              seqnum,
    2944             :                                              test_object_seqnum,
    2945             :                                              direction,
    2946             :                                              ret, offset, NULL);
    2947             : }
    2948             : 
    2949           0 : int journal_file_move_to_entry_by_realtime_for_data(
    2950             :                 JournalFile *f,
    2951             :                 uint64_t data_offset,
    2952             :                 uint64_t realtime,
    2953             :                 direction_t direction,
    2954             :                 Object **ret, uint64_t *offset) {
    2955             : 
    2956             :         Object *d;
    2957             :         int r;
    2958             : 
    2959           0 :         assert(f);
    2960             : 
    2961           0 :         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
    2962           0 :         if (r < 0)
    2963           0 :                 return r;
    2964             : 
    2965           0 :         return generic_array_bisect_plus_one(f,
    2966           0 :                                              le64toh(d->data.entry_offset),
    2967           0 :                                              le64toh(d->data.entry_array_offset),
    2968           0 :                                              le64toh(d->data.n_entries),
    2969             :                                              realtime,
    2970             :                                              test_object_realtime,
    2971             :                                              direction,
    2972             :                                              ret, offset, NULL);
    2973             : }
    2974             : 
    2975           9 : void journal_file_dump(JournalFile *f) {
    2976             :         Object *o;
    2977             :         int r;
    2978             :         uint64_t p;
    2979             : 
    2980           9 :         assert(f);
    2981           9 :         assert(f->header);
    2982             : 
    2983           9 :         journal_file_print_header(f);
    2984             : 
    2985           9 :         p = le64toh(f->header->header_size);
    2986        6451 :         while (p != 0) {
    2987        6442 :                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
    2988        6442 :                 if (r < 0)
    2989           0 :                         goto fail;
    2990             : 
    2991        6442 :                 switch (o->object.type) {
    2992             : 
    2993           0 :                 case OBJECT_UNUSED:
    2994           0 :                         printf("Type: OBJECT_UNUSED\n");
    2995           0 :                         break;
    2996             : 
    2997          86 :                 case OBJECT_DATA:
    2998          86 :                         printf("Type: OBJECT_DATA\n");
    2999          86 :                         break;
    3000             : 
    3001           3 :                 case OBJECT_FIELD:
    3002           3 :                         printf("Type: OBJECT_FIELD\n");
    3003           3 :                         break;
    3004             : 
    3005        6010 :                 case OBJECT_ENTRY:
    3006        6010 :                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
    3007        6010 :                                le64toh(o->entry.seqnum),
    3008        6010 :                                le64toh(o->entry.monotonic),
    3009        6010 :                                le64toh(o->entry.realtime));
    3010        6010 :                         break;
    3011             : 
    3012           9 :                 case OBJECT_FIELD_HASH_TABLE:
    3013           9 :                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
    3014           9 :                         break;
    3015             : 
    3016           9 :                 case OBJECT_DATA_HASH_TABLE:
    3017           9 :                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
    3018           9 :                         break;
    3019             : 
    3020         325 :                 case OBJECT_ENTRY_ARRAY:
    3021         325 :                         printf("Type: OBJECT_ENTRY_ARRAY\n");
    3022         325 :                         break;
    3023             : 
    3024           0 :                 case OBJECT_TAG:
    3025           0 :                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
    3026           0 :                                le64toh(o->tag.seqnum),
    3027           0 :                                le64toh(o->tag.epoch));
    3028           0 :                         break;
    3029             : 
    3030           0 :                 default:
    3031           0 :                         printf("Type: unknown (%i)\n", o->object.type);
    3032           0 :                         break;
    3033             :                 }
    3034             : 
    3035        6442 :                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
    3036           4 :                         printf("Flags: %s\n",
    3037           4 :                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
    3038             : 
    3039        6442 :                 if (p == le64toh(f->header->tail_object_offset))
    3040           9 :                         p = 0;
    3041             :                 else
    3042        6433 :                         p = p + ALIGN64(le64toh(o->object.size));
    3043             :         }
    3044             : 
    3045           9 :         return;
    3046           0 : fail:
    3047           0 :         log_error("File corrupt");
    3048             : }
    3049             : 
    3050          26 : static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
    3051             :         const char *x;
    3052             : 
    3053          26 :         x = format_timestamp(buf, l, t);
    3054          26 :         if (x)
    3055          18 :                 return x;
    3056           8 :         return " --- ";
    3057             : }
    3058             : 
    3059          13 : void journal_file_print_header(JournalFile *f) {
    3060             :         char a[33], b[33], c[33], d[33];
    3061             :         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
    3062             :         struct stat st;
    3063             :         char bytes[FORMAT_BYTES_MAX];
    3064             : 
    3065          13 :         assert(f);
    3066          13 :         assert(f->header);
    3067             : 
    3068         143 :         printf("File Path: %s\n"
    3069             :                "File ID: %s\n"
    3070             :                "Machine ID: %s\n"
    3071             :                "Boot ID: %s\n"
    3072             :                "Sequential Number ID: %s\n"
    3073             :                "State: %s\n"
    3074             :                "Compatible Flags:%s%s\n"
    3075             :                "Incompatible Flags:%s%s%s\n"
    3076             :                "Header size: %"PRIu64"\n"
    3077             :                "Arena size: %"PRIu64"\n"
    3078             :                "Data Hash Table Size: %"PRIu64"\n"
    3079             :                "Field Hash Table Size: %"PRIu64"\n"
    3080             :                "Rotate Suggested: %s\n"
    3081             :                "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
    3082             :                "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
    3083             :                "Head Realtime Timestamp: %s (%"PRIx64")\n"
    3084             :                "Tail Realtime Timestamp: %s (%"PRIx64")\n"
    3085             :                "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
    3086             :                "Objects: %"PRIu64"\n"
    3087             :                "Entry Objects: %"PRIu64"\n",
    3088             :                f->path,
    3089          13 :                sd_id128_to_string(f->header->file_id, a),
    3090          13 :                sd_id128_to_string(f->header->machine_id, b),
    3091          13 :                sd_id128_to_string(f->header->boot_id, c),
    3092          13 :                sd_id128_to_string(f->header->seqnum_id, d),
    3093          13 :                f->header->state == STATE_OFFLINE ? "OFFLINE" :
    3094          12 :                f->header->state == STATE_ONLINE ? "ONLINE" :
    3095           0 :                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
    3096          13 :                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
    3097          13 :                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
    3098          13 :                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
    3099          13 :                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
    3100          13 :                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
    3101          13 :                le64toh(f->header->header_size),
    3102          13 :                le64toh(f->header->arena_size),
    3103          13 :                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
    3104          13 :                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
    3105          13 :                yes_no(journal_file_rotate_suggested(f, 0)),
    3106          13 :                le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
    3107          13 :                le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
    3108          13 :                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
    3109          13 :                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
    3110          13 :                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
    3111          13 :                le64toh(f->header->n_objects),
    3112          13 :                le64toh(f->header->n_entries));
    3113             : 
    3114          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
    3115          13 :                 printf("Data Objects: %"PRIu64"\n"
    3116             :                        "Data Hash Table Fill: %.1f%%\n",
    3117          13 :                        le64toh(f->header->n_data),
    3118          13 :                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
    3119             : 
    3120          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
    3121          13 :                 printf("Field Objects: %"PRIu64"\n"
    3122             :                        "Field Hash Table Fill: %.1f%%\n",
    3123          13 :                        le64toh(f->header->n_fields),
    3124          13 :                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
    3125             : 
    3126          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
    3127          13 :                 printf("Tag Objects: %"PRIu64"\n",
    3128          13 :                        le64toh(f->header->n_tags));
    3129          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
    3130          13 :                 printf("Entry Array Objects: %"PRIu64"\n",
    3131          13 :                        le64toh(f->header->n_entry_arrays));
    3132             : 
    3133          13 :         if (fstat(f->fd, &st) >= 0)
    3134          13 :                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
    3135          13 : }
    3136             : 
    3137          25 : static int journal_file_warn_btrfs(JournalFile *f) {
    3138             :         unsigned attrs;
    3139             :         int r;
    3140             : 
    3141          25 :         assert(f);
    3142             : 
    3143             :         /* Before we write anything, check if the COW logic is turned
    3144             :          * off on btrfs. Given our write pattern that is quite
    3145             :          * unfriendly to COW file systems this should greatly improve
    3146             :          * performance on COW file systems, such as btrfs, at the
    3147             :          * expense of data integrity features (which shouldn't be too
    3148             :          * bad, given that we do our own checksumming). */
    3149             : 
    3150          25 :         r = btrfs_is_filesystem(f->fd);
    3151          25 :         if (r < 0)
    3152           0 :                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
    3153          25 :         if (!r)
    3154          25 :                 return 0;
    3155             : 
    3156           0 :         r = read_attr_fd(f->fd, &attrs);
    3157           0 :         if (r < 0)
    3158           0 :                 return log_warning_errno(r, "Failed to read file attributes: %m");
    3159             : 
    3160           0 :         if (attrs & FS_NOCOW_FL) {
    3161           0 :                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
    3162           0 :                 return 0;
    3163             :         }
    3164             : 
    3165           0 :         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
    3166             :                    "This is likely to slow down journal access substantially, please consider turning "
    3167             :                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
    3168             : 
    3169           0 :         return 1;
    3170             : }
    3171             : 
    3172        9843 : int journal_file_open(
    3173             :                 int fd,
    3174             :                 const char *fname,
    3175             :                 int flags,
    3176             :                 mode_t mode,
    3177             :                 bool compress,
    3178             :                 uint64_t compress_threshold_bytes,
    3179             :                 bool seal,
    3180             :                 JournalMetrics *metrics,
    3181             :                 MMapCache *mmap_cache,
    3182             :                 Set *deferred_closes,
    3183             :                 JournalFile *template,
    3184             :                 JournalFile **ret) {
    3185             : 
    3186        9843 :         bool newly_created = false;
    3187             :         JournalFile *f;
    3188             :         void *h;
    3189             :         int r;
    3190             : 
    3191        9843 :         assert(ret);
    3192        9843 :         assert(fd >= 0 || fname);
    3193             : 
    3194        9843 :         if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
    3195           0 :                 return -EINVAL;
    3196             : 
    3197        9843 :         if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
    3198           0 :                 return -EINVAL;
    3199             : 
    3200        9843 :         f = new(JournalFile, 1);
    3201        9843 :         if (!f)
    3202           0 :                 return -ENOMEM;
    3203             : 
    3204        9843 :         *f = (JournalFile) {
    3205             :                 .fd = fd,
    3206             :                 .mode = mode,
    3207             : 
    3208             :                 .flags = flags,
    3209        9843 :                 .prot = prot_from_flags(flags),
    3210        9843 :                 .writable = (flags & O_ACCMODE) != O_RDONLY,
    3211             : 
    3212             : #if HAVE_LZ4
    3213             :                 .compress_lz4 = compress,
    3214             : #elif HAVE_XZ
    3215             :                 .compress_xz = compress,
    3216             : #endif
    3217             :                 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
    3218        9843 :                                             DEFAULT_COMPRESS_THRESHOLD :
    3219        9822 :                                             MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
    3220             : #if HAVE_GCRYPT
    3221             :                 .seal = seal,
    3222             : #endif
    3223             :         };
    3224             : 
    3225        9843 :         if (DEBUG_LOGGING) {
    3226             :                 static int last_seal = -1, last_compress = -1;
    3227             :                 static uint64_t last_bytes = UINT64_MAX;
    3228             :                 char bytes[FORMAT_BYTES_MAX];
    3229             : 
    3230        9741 :                 if (last_seal != f->seal ||
    3231        9732 :                     last_compress != JOURNAL_FILE_COMPRESS(f) ||
    3232        9725 :                     last_bytes != f->compress_threshold_bytes) {
    3233             : 
    3234          18 :                         log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
    3235             :                                   yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
    3236             :                                   format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
    3237          18 :                         last_seal = f->seal;
    3238          18 :                         last_compress = JOURNAL_FILE_COMPRESS(f);
    3239          18 :                         last_bytes = f->compress_threshold_bytes;
    3240             :                 }
    3241             :         }
    3242             : 
    3243        9843 :         if (mmap_cache)
    3244        9818 :                 f->mmap = mmap_cache_ref(mmap_cache);
    3245             :         else {
    3246          25 :                 f->mmap = mmap_cache_new();
    3247          25 :                 if (!f->mmap) {
    3248           0 :                         r = -ENOMEM;
    3249           0 :                         goto fail;
    3250             :                 }
    3251             :         }
    3252             : 
    3253        9843 :         if (fname) {
    3254        9843 :                 f->path = strdup(fname);
    3255        9843 :                 if (!f->path) {
    3256           0 :                         r = -ENOMEM;
    3257           0 :                         goto fail;
    3258             :                 }
    3259             :         } else {
    3260           0 :                 assert(fd >= 0);
    3261             : 
    3262             :                 /* If we don't know the path, fill in something explanatory and vaguely useful */
    3263           0 :                 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
    3264           0 :                         r = -ENOMEM;
    3265           0 :                         goto fail;
    3266             :                 }
    3267             :         }
    3268             : 
    3269        9843 :         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
    3270        9843 :         if (!f->chain_cache) {
    3271           0 :                 r = -ENOMEM;
    3272           0 :                 goto fail;
    3273             :         }
    3274             : 
    3275        9843 :         if (f->fd < 0) {
    3276             :                 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
    3277             :                  * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
    3278             :                  * it doesn't hurt in that case. */
    3279             : 
    3280          27 :                 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
    3281          27 :                 if (f->fd < 0) {
    3282           0 :                         r = -errno;
    3283           0 :                         goto fail;
    3284             :                 }
    3285             : 
    3286             :                 /* fds we opened here by us should also be closed by us. */
    3287          27 :                 f->close_fd = true;
    3288             : 
    3289          27 :                 r = fd_nonblock(f->fd, false);
    3290          27 :                 if (r < 0)
    3291           0 :                         goto fail;
    3292             :         }
    3293             : 
    3294        9843 :         f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
    3295        9843 :         if (!f->cache_fd) {
    3296           0 :                 r = -ENOMEM;
    3297           0 :                 goto fail;
    3298             :         }
    3299             : 
    3300        9843 :         r = journal_file_fstat(f);
    3301        9843 :         if (r < 0)
    3302           0 :                 goto fail;
    3303             : 
    3304        9843 :         if (f->last_stat.st_size == 0 && f->writable) {
    3305             : 
    3306          25 :                 (void) journal_file_warn_btrfs(f);
    3307             : 
    3308             :                 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
    3309             :                  * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
    3310             :                  * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
    3311             :                  * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
    3312             :                  * solely on mtime/atime/ctime of the file. */
    3313          25 :                 (void) fd_setcrtime(f->fd, 0);
    3314             : 
    3315             : #if HAVE_GCRYPT
    3316             :                 /* Try to load the FSPRG state, and if we can't, then
    3317             :                  * just don't do sealing */
    3318          25 :                 if (f->seal) {
    3319          12 :                         r = journal_file_fss_load(f);
    3320          12 :                         if (r < 0)
    3321          12 :                                 f->seal = false;
    3322             :                 }
    3323             : #endif
    3324             : 
    3325          25 :                 r = journal_file_init_header(f, template);
    3326          25 :                 if (r < 0)
    3327           0 :                         goto fail;
    3328             : 
    3329          25 :                 r = journal_file_fstat(f);
    3330          25 :                 if (r < 0)
    3331           0 :                         goto fail;
    3332             : 
    3333          25 :                 newly_created = true;
    3334             :         }
    3335             : 
    3336        9843 :         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
    3337           0 :                 r = -ENODATA;
    3338           0 :                 goto fail;
    3339             :         }
    3340             : 
    3341        9843 :         r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
    3342        9843 :         if (r == -EINVAL) {
    3343             :                 /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
    3344             :                  * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
    3345             :                  * code. */
    3346           0 :                 r = -EAFNOSUPPORT;
    3347           0 :                 goto fail;
    3348             :         }
    3349        9843 :         if (r < 0)
    3350           0 :                 goto fail;
    3351             : 
    3352        9843 :         f->header = h;
    3353             : 
    3354        9843 :         if (!newly_created) {
    3355        9818 :                 set_clear_with_destructor(deferred_closes, journal_file_close);
    3356             : 
    3357        9818 :                 r = journal_file_verify_header(f);
    3358        9818 :                 if (r < 0)
    3359           0 :                         goto fail;
    3360             :         }
    3361             : 
    3362             : #if HAVE_GCRYPT
    3363        9843 :         if (!newly_created && f->writable) {
    3364           1 :                 r = journal_file_fss_load(f);
    3365           1 :                 if (r < 0)
    3366           0 :                         goto fail;
    3367             :         }
    3368             : #endif
    3369             : 
    3370        9843 :         if (f->writable) {
    3371          26 :                 if (metrics) {
    3372           0 :                         journal_default_metrics(metrics, f->fd);
    3373           0 :                         f->metrics = *metrics;
    3374          26 :                 } else if (template)
    3375           3 :                         f->metrics = template->metrics;
    3376             : 
    3377          26 :                 r = journal_file_refresh_header(f);
    3378          26 :                 if (r < 0)
    3379           0 :                         goto fail;
    3380             :         }
    3381             : 
    3382             : #if HAVE_GCRYPT
    3383        9843 :         r = journal_file_hmac_setup(f);
    3384        9843 :         if (r < 0)
    3385           0 :                 goto fail;
    3386             : #endif
    3387             : 
    3388        9843 :         if (newly_created) {
    3389          25 :                 r = journal_file_setup_field_hash_table(f);
    3390          25 :                 if (r < 0)
    3391           0 :                         goto fail;
    3392             : 
    3393          25 :                 r = journal_file_setup_data_hash_table(f);
    3394          25 :                 if (r < 0)
    3395           0 :                         goto fail;
    3396             : 
    3397             : #if HAVE_GCRYPT
    3398          25 :                 r = journal_file_append_first_tag(f);
    3399          25 :                 if (r < 0)
    3400           0 :                         goto fail;
    3401             : #endif
    3402             :         }
    3403             : 
    3404        9843 :         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
    3405           0 :                 r = -EIO;
    3406           0 :                 goto fail;
    3407             :         }
    3408             : 
    3409        9843 :         if (template && template->post_change_timer) {
    3410           0 :                 r = journal_file_enable_post_change_timer(
    3411             :                                 f,
    3412             :                                 sd_event_source_get_event(template->post_change_timer),
    3413             :                                 template->post_change_timer_period);
    3414             : 
    3415           0 :                 if (r < 0)
    3416           0 :                         goto fail;
    3417             :         }
    3418             : 
    3419             :         /* The file is opened now successfully, thus we take possession of any passed in fd. */
    3420        9843 :         f->close_fd = true;
    3421             : 
    3422        9843 :         *ret = f;
    3423        9843 :         return 0;
    3424             : 
    3425           0 : fail:
    3426           0 :         if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
    3427           0 :                 r = -EIO;
    3428             : 
    3429           0 :         (void) journal_file_close(f);
    3430             : 
    3431           0 :         return r;
    3432             : }
    3433             : 
    3434           2 : int journal_file_archive(JournalFile *f) {
    3435           2 :         _cleanup_free_ char *p = NULL;
    3436             : 
    3437           2 :         assert(f);
    3438             : 
    3439           2 :         if (!f->writable)
    3440           0 :                 return -EINVAL;
    3441             : 
    3442             :         /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
    3443             :          * rotation, since we don't know the actual path, and couldn't rename the file hence. */
    3444           2 :         if (path_startswith(f->path, "/proc/self/fd"))
    3445           0 :                 return -EINVAL;
    3446             : 
    3447           2 :         if (!endswith(f->path, ".journal"))
    3448           0 :                 return -EINVAL;
    3449             : 
    3450           2 :         if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
    3451           2 :                      (int) strlen(f->path) - 8, f->path,
    3452           2 :                      SD_ID128_FORMAT_VAL(f->header->seqnum_id),
    3453           2 :                      le64toh(f->header->head_entry_seqnum),
    3454           2 :                      le64toh(f->header->head_entry_realtime)) < 0)
    3455           0 :                 return -ENOMEM;
    3456             : 
    3457             :         /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
    3458             :          * ignore that case. */
    3459           2 :         if (rename(f->path, p) < 0 && errno != ENOENT)
    3460           0 :                 return -errno;
    3461             : 
    3462             :         /* Sync the rename to disk */
    3463           2 :         (void) fsync_directory_of_file(f->fd);
    3464             : 
    3465             :         /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
    3466             :          * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
    3467             :          * which would result in the rotated journal never getting fsync() called before closing.  Now we simply queue
    3468             :          * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
    3469             :          * occurs. */
    3470           2 :         f->archive = true;
    3471             : 
    3472             :         /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
    3473             :          * files when we archive them */
    3474           2 :         f->defrag_on_close = true;
    3475             : 
    3476           2 :         return 0;
    3477             : }
    3478             : 
    3479           2 : JournalFile* journal_initiate_close(
    3480             :                 JournalFile *f,
    3481             :                 Set *deferred_closes) {
    3482             : 
    3483             :         int r;
    3484             : 
    3485           2 :         assert(f);
    3486             : 
    3487           2 :         if (deferred_closes) {
    3488             : 
    3489           0 :                 r = set_put(deferred_closes, f);
    3490           0 :                 if (r < 0)
    3491           0 :                         log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
    3492             :                 else {
    3493           0 :                         (void) journal_file_set_offline(f, false);
    3494           0 :                         return NULL;
    3495             :                 }
    3496             :         }
    3497             : 
    3498           2 :         return journal_file_close(f);
    3499             : }
    3500             : 
    3501           2 : int journal_file_rotate(
    3502             :                 JournalFile **f,
    3503             :                 bool compress,
    3504             :                 uint64_t compress_threshold_bytes,
    3505             :                 bool seal,
    3506             :                 Set *deferred_closes) {
    3507             : 
    3508           2 :         JournalFile *new_file = NULL;
    3509             :         int r;
    3510             : 
    3511           2 :         assert(f);
    3512           2 :         assert(*f);
    3513             : 
    3514           2 :         r = journal_file_archive(*f);
    3515           2 :         if (r < 0)
    3516           0 :                 return r;
    3517             : 
    3518          10 :         r = journal_file_open(
    3519             :                         -1,
    3520           2 :                         (*f)->path,
    3521           2 :                         (*f)->flags,
    3522           2 :                         (*f)->mode,
    3523             :                         compress,
    3524             :                         compress_threshold_bytes,
    3525             :                         seal,
    3526             :                         NULL,            /* metrics */
    3527           2 :                         (*f)->mmap,
    3528             :                         deferred_closes,
    3529             :                         *f,              /* template */
    3530             :                         &new_file);
    3531             : 
    3532           2 :         journal_initiate_close(*f, deferred_closes);
    3533           2 :         *f = new_file;
    3534             : 
    3535           2 :         return r;
    3536             : }
    3537             : 
    3538           0 : int journal_file_dispose(int dir_fd, const char *fname) {
    3539           0 :         _cleanup_free_ char *p = NULL;
    3540           0 :         _cleanup_close_ int fd = -1;
    3541             : 
    3542           0 :         assert(fname);
    3543             : 
    3544             :         /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that
    3545             :          * this is done without looking into the file or changing any of its contents. The idea is that this is called
    3546             :          * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
    3547             :          * for writing anymore. */
    3548             : 
    3549           0 :         if (!endswith(fname, ".journal"))
    3550           0 :                 return -EINVAL;
    3551             : 
    3552           0 :         if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
    3553           0 :                      (int) strlen(fname) - 8, fname,
    3554             :                      now(CLOCK_REALTIME),
    3555             :                      random_u64()) < 0)
    3556           0 :                 return -ENOMEM;
    3557             : 
    3558           0 :         if (renameat(dir_fd, fname, dir_fd, p) < 0)
    3559           0 :                 return -errno;
    3560             : 
    3561             :         /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
    3562           0 :         fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
    3563           0 :         if (fd < 0)
    3564           0 :                 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
    3565             :         else {
    3566           0 :                 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
    3567           0 :                 (void) btrfs_defrag_fd(fd);
    3568             :         }
    3569             : 
    3570           0 :         return 0;
    3571             : }
    3572             : 
    3573           0 : int journal_file_open_reliably(
    3574             :                 const char *fname,
    3575             :                 int flags,
    3576             :                 mode_t mode,
    3577             :                 bool compress,
    3578             :                 uint64_t compress_threshold_bytes,
    3579             :                 bool seal,
    3580             :                 JournalMetrics *metrics,
    3581             :                 MMapCache *mmap_cache,
    3582             :                 Set *deferred_closes,
    3583             :                 JournalFile *template,
    3584             :                 JournalFile **ret) {
    3585             : 
    3586             :         int r;
    3587             : 
    3588           0 :         r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
    3589             :                               deferred_closes, template, ret);
    3590           0 :         if (!IN_SET(r,
    3591             :                     -EBADMSG,           /* Corrupted */
    3592             :                     -ENODATA,           /* Truncated */
    3593             :                     -EHOSTDOWN,         /* Other machine */
    3594             :                     -EPROTONOSUPPORT,   /* Incompatible feature */
    3595             :                     -EBUSY,             /* Unclean shutdown */
    3596             :                     -ESHUTDOWN,         /* Already archived */
    3597             :                     -EIO,               /* IO error, including SIGBUS on mmap */
    3598             :                     -EIDRM,             /* File has been deleted */
    3599             :                     -ETXTBSY))          /* File is from the future */
    3600           0 :                 return r;
    3601             : 
    3602           0 :         if ((flags & O_ACCMODE) == O_RDONLY)
    3603           0 :                 return r;
    3604             : 
    3605           0 :         if (!(flags & O_CREAT))
    3606           0 :                 return r;
    3607             : 
    3608           0 :         if (!endswith(fname, ".journal"))
    3609           0 :                 return r;
    3610             : 
    3611             :         /* The file is corrupted. Rotate it away and try it again (but only once) */
    3612           0 :         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
    3613             : 
    3614           0 :         r = journal_file_dispose(AT_FDCWD, fname);
    3615           0 :         if (r < 0)
    3616           0 :                 return r;
    3617             : 
    3618           0 :         return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
    3619             :                                  deferred_closes, template, ret);
    3620             : }
    3621             : 
    3622       10000 : int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
    3623             :         uint64_t i, n;
    3624       10000 :         uint64_t q, xor_hash = 0;
    3625             :         int r;
    3626             :         EntryItem *items;
    3627             :         dual_timestamp ts;
    3628             :         const sd_id128_t *boot_id;
    3629             : 
    3630       10000 :         assert(from);
    3631       10000 :         assert(to);
    3632       10000 :         assert(o);
    3633       10000 :         assert(p);
    3634             : 
    3635       10000 :         if (!to->writable)
    3636           0 :                 return -EPERM;
    3637             : 
    3638       10000 :         ts.monotonic = le64toh(o->entry.monotonic);
    3639       10000 :         ts.realtime = le64toh(o->entry.realtime);
    3640       10000 :         boot_id = &o->entry.boot_id;
    3641             : 
    3642       10000 :         n = journal_file_entry_n_items(o);
    3643             :         /* alloca() can't take 0, hence let's allocate at least one */
    3644       10000 :         items = newa(EntryItem, MAX(1u, n));
    3645             : 
    3646      235468 :         for (i = 0; i < n; i++) {
    3647             :                 uint64_t l, h;
    3648             :                 le64_t le_hash;
    3649             :                 size_t t;
    3650             :                 void *data;
    3651             :                 Object *u;
    3652             : 
    3653      225468 :                 q = le64toh(o->entry.items[i].object_offset);
    3654      225468 :                 le_hash = o->entry.items[i].hash;
    3655             : 
    3656      225468 :                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
    3657      225468 :                 if (r < 0)
    3658           0 :                         return r;
    3659             : 
    3660      225468 :                 if (le_hash != o->data.hash)
    3661           0 :                         return -EBADMSG;
    3662             : 
    3663      225468 :                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
    3664      225468 :                 t = (size_t) l;
    3665             : 
    3666             :                 /* We hit the limit on 32bit machines */
    3667      225468 :                 if ((uint64_t) t != l)
    3668           0 :                         return -E2BIG;
    3669             : 
    3670      225468 :                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
    3671             : #if HAVE_XZ || HAVE_LZ4
    3672          34 :                         size_t rsize = 0;
    3673             : 
    3674          68 :                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
    3675          34 :                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
    3676          34 :                         if (r < 0)
    3677           0 :                                 return r;
    3678             : 
    3679          34 :                         data = from->compress_buffer;
    3680          34 :                         l = rsize;
    3681             : #else
    3682             :                         return -EPROTONOSUPPORT;
    3683             : #endif
    3684             :                 } else
    3685      225434 :                         data = o->data.payload;
    3686             : 
    3687      225468 :                 r = journal_file_append_data(to, data, l, &u, &h);
    3688      225468 :                 if (r < 0)
    3689           0 :                         return r;
    3690             : 
    3691      225468 :                 xor_hash ^= le64toh(u->data.hash);
    3692      225468 :                 items[i].object_offset = htole64(h);
    3693      225468 :                 items[i].hash = u->data.hash;
    3694             : 
    3695      225468 :                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
    3696      225468 :                 if (r < 0)
    3697           0 :                         return r;
    3698             :         }
    3699             : 
    3700       10000 :         r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
    3701             :                                                NULL, NULL, NULL);
    3702             : 
    3703       10000 :         if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
    3704           0 :                 return -EIO;
    3705             : 
    3706       10000 :         return r;
    3707             : }
    3708             : 
    3709           0 : void journal_reset_metrics(JournalMetrics *m) {
    3710           0 :         assert(m);
    3711             : 
    3712             :         /* Set everything to "pick automatic values". */
    3713             : 
    3714           0 :         *m = (JournalMetrics) {
    3715             :                 .min_use = (uint64_t) -1,
    3716             :                 .max_use = (uint64_t) -1,
    3717             :                 .min_size = (uint64_t) -1,
    3718             :                 .max_size = (uint64_t) -1,
    3719             :                 .keep_free = (uint64_t) -1,
    3720             :                 .n_max_files = (uint64_t) -1,
    3721             :         };
    3722           0 : }
    3723             : 
    3724           0 : void journal_default_metrics(JournalMetrics *m, int fd) {
    3725             :         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
    3726             :         struct statvfs ss;
    3727           0 :         uint64_t fs_size = 0;
    3728             : 
    3729           0 :         assert(m);
    3730           0 :         assert(fd >= 0);
    3731             : 
    3732           0 :         if (fstatvfs(fd, &ss) >= 0)
    3733           0 :                 fs_size = ss.f_frsize * ss.f_blocks;
    3734             :         else
    3735           0 :                 log_debug_errno(errno, "Failed to determine disk size: %m");
    3736             : 
    3737           0 :         if (m->max_use == (uint64_t) -1) {
    3738             : 
    3739           0 :                 if (fs_size > 0)
    3740           0 :                         m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
    3741             :                                            MAX_USE_LOWER, MAX_USE_UPPER);
    3742             :                 else
    3743           0 :                         m->max_use = MAX_USE_LOWER;
    3744             :         } else {
    3745           0 :                 m->max_use = PAGE_ALIGN(m->max_use);
    3746             : 
    3747           0 :                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
    3748           0 :                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
    3749             :         }
    3750             : 
    3751           0 :         if (m->min_use == (uint64_t) -1) {
    3752           0 :                 if (fs_size > 0)
    3753           0 :                         m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
    3754             :                                            MIN_USE_LOW, MIN_USE_HIGH);
    3755             :                 else
    3756           0 :                         m->min_use = MIN_USE_LOW;
    3757             :         }
    3758             : 
    3759           0 :         if (m->min_use > m->max_use)
    3760           0 :                 m->min_use = m->max_use;
    3761             : 
    3762           0 :         if (m->max_size == (uint64_t) -1)
    3763           0 :                 m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
    3764             :                                   MAX_SIZE_UPPER);
    3765             :         else
    3766           0 :                 m->max_size = PAGE_ALIGN(m->max_size);
    3767             : 
    3768           0 :         if (m->max_size != 0) {
    3769           0 :                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
    3770           0 :                         m->max_size = JOURNAL_FILE_SIZE_MIN;
    3771             : 
    3772           0 :                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
    3773           0 :                         m->max_use = m->max_size*2;
    3774             :         }
    3775             : 
    3776           0 :         if (m->min_size == (uint64_t) -1)
    3777           0 :                 m->min_size = JOURNAL_FILE_SIZE_MIN;
    3778             :         else
    3779           0 :                 m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
    3780             :                                     JOURNAL_FILE_SIZE_MIN,
    3781             :                                     m->max_size ?: UINT64_MAX);
    3782             : 
    3783           0 :         if (m->keep_free == (uint64_t) -1) {
    3784           0 :                 if (fs_size > 0)
    3785           0 :                         m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
    3786             :                                            KEEP_FREE_UPPER);
    3787             :                 else
    3788           0 :                         m->keep_free = DEFAULT_KEEP_FREE;
    3789             :         }
    3790             : 
    3791           0 :         if (m->n_max_files == (uint64_t) -1)
    3792           0 :                 m->n_max_files = DEFAULT_N_MAX_FILES;
    3793             : 
    3794           0 :         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
    3795             :                   format_bytes(a, sizeof(a), m->min_use),
    3796             :                   format_bytes(b, sizeof(b), m->max_use),
    3797             :                   format_bytes(c, sizeof(c), m->max_size),
    3798             :                   format_bytes(d, sizeof(d), m->min_size),
    3799             :                   format_bytes(e, sizeof(e), m->keep_free),
    3800             :                   m->n_max_files);
    3801           0 : }
    3802             : 
    3803           0 : int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
    3804           0 :         assert(f);
    3805           0 :         assert(f->header);
    3806           0 :         assert(from || to);
    3807             : 
    3808           0 :         if (from) {
    3809           0 :                 if (f->header->head_entry_realtime == 0)
    3810           0 :                         return -ENOENT;
    3811             : 
    3812           0 :                 *from = le64toh(f->header->head_entry_realtime);
    3813             :         }
    3814             : 
    3815           0 :         if (to) {
    3816           0 :                 if (f->header->tail_entry_realtime == 0)
    3817           0 :                         return -ENOENT;
    3818             : 
    3819           0 :                 *to = le64toh(f->header->tail_entry_realtime);
    3820             :         }
    3821             : 
    3822           0 :         return 1;
    3823             : }
    3824             : 
    3825           0 : int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
    3826             :         Object *o;
    3827             :         uint64_t p;
    3828             :         int r;
    3829             : 
    3830           0 :         assert(f);
    3831           0 :         assert(from || to);
    3832             : 
    3833           0 :         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
    3834           0 :         if (r <= 0)
    3835           0 :                 return r;
    3836             : 
    3837           0 :         if (le64toh(o->data.n_entries) <= 0)
    3838           0 :                 return 0;
    3839             : 
    3840           0 :         if (from) {
    3841           0 :                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
    3842           0 :                 if (r < 0)
    3843           0 :                         return r;
    3844             : 
    3845           0 :                 *from = le64toh(o->entry.monotonic);
    3846             :         }
    3847             : 
    3848           0 :         if (to) {
    3849           0 :                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
    3850           0 :                 if (r < 0)
    3851           0 :                         return r;
    3852             : 
    3853           0 :                 r = generic_array_get_plus_one(f,
    3854           0 :                                                le64toh(o->data.entry_offset),
    3855           0 :                                                le64toh(o->data.entry_array_offset),
    3856           0 :                                                le64toh(o->data.n_entries)-1,
    3857             :                                                &o, NULL);
    3858           0 :                 if (r <= 0)
    3859           0 :                         return r;
    3860             : 
    3861           0 :                 *to = le64toh(o->entry.monotonic);
    3862             :         }
    3863             : 
    3864           0 :         return 1;
    3865             : }
    3866             : 
    3867          13 : bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
    3868          13 :         assert(f);
    3869          13 :         assert(f->header);
    3870             : 
    3871             :         /* If we gained new header fields we gained new features,
    3872             :          * hence suggest a rotation */
    3873          13 :         if (le64toh(f->header->header_size) < sizeof(Header)) {
    3874           0 :                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
    3875           0 :                 return true;
    3876             :         }
    3877             : 
    3878             :         /* Let's check if the hash tables grew over a certain fill
    3879             :          * level (75%, borrowing this value from Java's hash table
    3880             :          * implementation), and if so suggest a rotation. To calculate
    3881             :          * the fill level we need the n_data field, which only exists
    3882             :          * in newer versions. */
    3883             : 
    3884          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
    3885          13 :                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
    3886           0 :                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
    3887             :                                   f->path,
    3888             :                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
    3889             :                                   le64toh(f->header->n_data),
    3890             :                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
    3891             :                                   (unsigned long long) f->last_stat.st_size,
    3892             :                                   f->last_stat.st_size / le64toh(f->header->n_data));
    3893           0 :                         return true;
    3894             :                 }
    3895             : 
    3896          13 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
    3897          13 :                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
    3898           0 :                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
    3899             :                                   f->path,
    3900             :                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
    3901             :                                   le64toh(f->header->n_fields),
    3902             :                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
    3903           0 :                         return true;
    3904             :                 }
    3905             : 
    3906             :         /* Are the data objects properly indexed by field objects? */
    3907          26 :         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
    3908          26 :             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
    3909          22 :             le64toh(f->header->n_data) > 0 &&
    3910           9 :             le64toh(f->header->n_fields) == 0)
    3911           7 :                 return true;
    3912             : 
    3913           6 :         if (max_file_usec > 0) {
    3914             :                 usec_t t, h;
    3915             : 
    3916           0 :                 h = le64toh(f->header->head_entry_realtime);
    3917           0 :                 t = now(CLOCK_REALTIME);
    3918             : 
    3919           0 :                 if (h > 0 && t > h + max_file_usec)
    3920           0 :                         return true;
    3921             :         }
    3922             : 
    3923           6 :         return false;
    3924             : }

Generated by: LCOV version 1.14