fs/xfs/xfs_log_priv.h

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
0004  * All Rights Reserved.
0005  */
0006 #ifndef __XFS_LOG_PRIV_H__
0007 #define __XFS_LOG_PRIV_H__
0008
/*
 * Forward declarations for structures this header names before (or without)
 * seeing their definitions.
 *
 * struct xfs_trans appears only inside prototype parameter lists in this
 * header. Without a file-scope declaration, each such prototype would declare
 * its own prototype-scoped struct type, which is incompatible with the real
 * one whenever this header is seen before the definition.
 */
struct xfs_buf;
struct xlog;
struct xlog_ticket;
struct xfs_mount;
struct xfs_trans;
0013
0014 /*
0015  * get client id from packed copy.
0016  *
0017  * this hack is here because the xlog_pack code copies four bytes
0018  * of xlog_op_header containing the fields oh_clientid, oh_flags
0019  * and oh_res2 into the packed copy.
0020  *
0021  * later on this four byte chunk is treated as an int and the
0022  * client id is pulled out.
0023  *
0024  * this has endian issues, of course.
0025  */
0026 static inline uint xlog_get_client_id(__be32 i)
0027 {
0028     return be32_to_cpu(i) >> 24;
0029 }
0030
0031 /*
0032  * In core log state
0033  */
enum xlog_iclog_state {
	/* Current IC log being written to */
	XLOG_STATE_ACTIVE,
	/* Want to sync this iclog; no more writes */
	XLOG_STATE_WANT_SYNC,
	/* This IC log is syncing */
	XLOG_STATE_SYNCING,
	/* Done syncing to disk */
	XLOG_STATE_DONE_SYNC,
	/* Callback functions now */
	XLOG_STATE_CALLBACK,
	/* Dirty IC log, not ready for ACTIVE status */
	XLOG_STATE_DIRTY,
};
0042
/*
 * Initialiser-list entries pairing every enum xlog_iclog_state value with its
 * name as a string, listed in enum order.
 */
#define XLOG_STATE_STRINGS						\
	{ XLOG_STATE_ACTIVE,		"XLOG_STATE_ACTIVE" },		\
	{ XLOG_STATE_WANT_SYNC,		"XLOG_STATE_WANT_SYNC" },	\
	{ XLOG_STATE_SYNCING,		"XLOG_STATE_SYNCING" },		\
	{ XLOG_STATE_DONE_SYNC,		"XLOG_STATE_DONE_SYNC" },	\
	{ XLOG_STATE_CALLBACK,		"XLOG_STATE_CALLBACK" },	\
	{ XLOG_STATE_DIRTY,		"XLOG_STATE_DIRTY" }
0050
0051 /*
0052  * In core log flags
0053  */
/* iclog needs REQ_PREFLUSH */
#define XLOG_ICL_NEED_FLUSH	(1u << 0)
/* iclog needs REQ_FUA */
#define XLOG_ICL_NEED_FUA	(1u << 1)

/* Initialiser-list entries pairing each iclog flag with its name. */
#define XLOG_ICL_STRINGS					\
	{ XLOG_ICL_NEED_FLUSH,	"XLOG_ICL_NEED_FLUSH" },	\
	{ XLOG_ICL_NEED_FUA,	"XLOG_ICL_NEED_FUA" }
0060
0061
0062 /*
0063  * Log ticket flags
0064  */
/* permanent reservation */
#define XLOG_TIC_PERM_RESERV	(1u << 0)

/* Initialiser-list entries pairing each ticket flag with its name. */
#define XLOG_TIC_FLAGS						\
	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }
0069
0070 /*
0071  * Below are states for covering allocation transactions.
0072  * By covering, we mean changing the h_tail_lsn in the last on-disk
0073  * log write such that no allocation transactions will be re-done during
0074  * recovery after a system crash. Recovery starts at the last on-disk
0075  * log write.
0076  *
0077  * These states are used to insert dummy log entries to cover
0078  * space allocation transactions which can undo non-transactional changes
0079  * after a crash. Writes to a file with space
0080  * already allocated do not result in any transactions. Allocations
0081  * might include space beyond the EOF. So if we just push the EOF a
0082  * little, the last transaction for the file could contain the wrong
0083  * size. If there is no file system activity, after an allocation
0084  * transaction, and the system crashes, the allocation transaction
0085  * will get replayed and the file will be truncated. This could
0086  * be hours/days/... after the allocation occurred.
0087  *
0088  * The fix for this is to do two dummy transactions when the
0089  * system is idle. We need two dummy transaction because the h_tail_lsn
0090  * in the log record header needs to point beyond the last possible
0091  * non-dummy transaction. The first dummy changes the h_tail_lsn to
0092  * the first transaction before the dummy. The second dummy causes
0093  * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
0094  *
0095  * These dummy transactions get committed when everything
0096  * is idle (after there has been some activity).
0097  *
0098  * There are 5 states used to control this.
0099  *
0100  *  IDLE -- no logging has been done on the file system or
0101  *      we are done covering previous transactions.
0102  *  NEED -- logging has occurred and we need a dummy transaction
0103  *      when the log becomes idle.
0104  *  DONE -- we were in the NEED state and have committed a dummy
0105  *      transaction.
0106  *  NEED2 -- we detected that a dummy transaction has gone to the
0107  *      on disk log with no other transactions.
0108  *  DONE2 -- we committed a dummy transaction when in the NEED2 state.
0109  *
0110  * There are two places where we switch states:
0111  *
0112  * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
0113  *  We commit the dummy transaction and switch to DONE or DONE2,
0114  *  respectively. In all other states, we don't do anything.
0115  *
0116  * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
0117  *
0118  *  No matter what state we are in, if this isn't the dummy
0119  *  transaction going out, the next state is NEED.
0120  *  So, if we aren't in the DONE or DONE2 states, the next state
0121  *  is NEED. We can't be finishing a write of the dummy record
0122  *  unless it was committed and the state switched to DONE or DONE2.
0123  *
0124  *  If we are in the DONE state and this was a write of the
0125  *      dummy transaction, we move to NEED2.
0126  *
0127  *  If we are in the DONE2 state and this was a write of the
0128  *      dummy transaction, we move to IDLE.
0129  *
0130  *
0131  * Writing only one dummy transaction can get appended to
0132  * one file space allocation. When this happens, the log recovery
0133  * code replays the space allocation and a file could be truncated.
0134  * This is why we have the NEED2 and DONE2 states before going idle.
0135  */
0136
/* Log covering states; the numbering is consecutive from zero. */
#define XLOG_STATE_COVER_IDLE		0
#define XLOG_STATE_COVER_NEED		1
#define XLOG_STATE_COVER_DONE		2
#define XLOG_STATE_COVER_NEED2		3
#define XLOG_STATE_COVER_DONE2		4

#define XLOG_COVER_OPS			5
0144
0145 typedef struct xlog_ticket {
0146     struct list_head    t_queue;    /* reserve/write queue */
0147     struct task_struct  *t_task;    /* task that owns this ticket */
0148     xlog_tid_t      t_tid;      /* transaction identifier */
0149     atomic_t        t_ref;      /* ticket reference count */
0150     int         t_curr_res; /* current reservation */
0151     int         t_unit_res; /* unit reservation */
0152     char            t_ocnt;     /* original unit count */
0153     char            t_cnt;      /* current unit count */
0154     uint8_t         t_flags;    /* properties of reservation */
0155     int         t_iclog_hdrs;   /* iclog hdrs in t_curr_res */
0156 } xlog_ticket_t;
0157
0158 /*
0159  * - A log record header is 512 bytes.  There is plenty of room to grow the
0160  *  xlog_rec_header_t into the reserved space.
0161  * - ic_data follows, so a write to disk can start at the beginning of
0162  *  the iclog.
0163  * - ic_force_wait is used to implement synchronous forcing of the iclog to disk.
0164  * - ic_next is the pointer to the next iclog in the ring.
0165  * - ic_log is a pointer back to the global log structure.
0166  * - ic_size is the full size of the log buffer, minus the cycle headers.
0167  * - ic_offset is the current number of bytes written to in this iclog.
0168  * - ic_refcnt is bumped when someone is writing to the log.
0169  * - ic_state is the state of the iclog.
0170  *
0171  * Because of cacheline contention on large machines, we need to separate
0172  * various resources onto different cachelines. To start with, make the
0173  * structure cacheline aligned. The following fields can be contended on
0174  * by independent processes:
0175  *
0176  *  - ic_callbacks
0177  *  - ic_refcnt
0178  *  - fields protected by the global l_icloglock
0179  *
0180  * so we need to ensure that these fields are located in separate cachelines.
0181  * We'll put all the read-only and l_icloglock fields in the first cacheline,
0182  * and move everything else out to subsequent cachelines.
0183  */
0184 typedef struct xlog_in_core {
0185     wait_queue_head_t   ic_force_wait;
0186     wait_queue_head_t   ic_write_wait;
0187     struct xlog_in_core *ic_next;
0188     struct xlog_in_core *ic_prev;
0189     struct xlog     *ic_log;
0190     u32         ic_size;
0191     u32         ic_offset;
0192     enum xlog_iclog_state   ic_state;
0193     unsigned int        ic_flags;
0194     void            *ic_datap;  /* pointer to iclog data */
0195     struct list_head    ic_callbacks;
0196
0197     /* reference counts need their own cacheline */
0198     atomic_t        ic_refcnt ____cacheline_aligned_in_smp;
0199     xlog_in_core_2_t    *ic_data;
0200 #define ic_header   ic_data->hic_header
0201 #ifdef DEBUG
0202     bool            ic_fail_crc : 1;
0203 #endif
0204     struct semaphore    ic_sema;
0205     struct work_struct  ic_end_io_work;
0206     struct bio      ic_bio;
0207     struct bio_vec      ic_bvec[];
0208 } xlog_in_core_t;
0209
0210 /*
0211  * The CIL context is used to aggregate per-transaction details as well as be
0212  * passed to the iclog for checkpoint post-commit processing.  After being
0213  * passed to the iclog, another context needs to be allocated for tracking the
0214  * next set of transactions to be aggregated into a checkpoint.
0215  */
0216 struct xfs_cil;
0217
0218 struct xfs_cil_ctx {
0219     struct xfs_cil      *cil;
0220     xfs_csn_t       sequence;   /* chkpt sequence # */
0221     xfs_lsn_t       start_lsn;  /* first LSN of chkpt commit */
0222     xfs_lsn_t       commit_lsn; /* chkpt commit record lsn */
0223     struct xlog_in_core *commit_iclog;
0224     struct xlog_ticket  *ticket;    /* chkpt ticket */
0225     atomic_t        space_used; /* aggregate size of regions */
0226     struct list_head    busy_extents;   /* busy extents in chkpt */
0227     struct list_head    log_items;  /* log items in chkpt */
0228     struct list_head    lv_chain;   /* logvecs being pushed */
0229     struct list_head    iclog_entry;
0230     struct list_head    committing; /* ctx committing list */
0231     struct work_struct  discard_endio_work;
0232     struct work_struct  push_work;
0233     atomic_t        order_id;
0234 };
0235
0236 /*
0237  * Per-cpu CIL tracking items
0238  */
0239 struct xlog_cil_pcp {
0240     int32_t         space_used;
0241     uint32_t        space_reserved;
0242     struct list_head    busy_extents;
0243     struct list_head    log_items;
0244 };
0245
0246 /*
0247  * Committed Item List structure
0248  *
0249  * This structure is used to track log items that have been committed but not
0250  * yet written into the log. It is used only when the delayed logging mount
0251  * option is enabled.
0252  *
0253  * This structure tracks the list of committing checkpoint contexts so
0254  * we can avoid the problem of having to hold out new transactions during a
0255  * flush until we have the commit record LSN of the checkpoint. We can
0256  * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
0257  * sequence match and extract the commit LSN directly from there. If the
0258  * checkpoint is still in the process of committing, we can block waiting for
0259  * the commit LSN to be determined as well. This should make synchronous
0260  * operations almost as efficient as the old logging methods.
0261  */
0262 struct xfs_cil {
0263     struct xlog     *xc_log;
0264     unsigned long       xc_flags;
0265     atomic_t        xc_iclog_hdrs;
0266     struct workqueue_struct *xc_push_wq;
0267
0268     struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
0269     struct xfs_cil_ctx  *xc_ctx;
0270
0271     spinlock_t      xc_push_lock ____cacheline_aligned_in_smp;
0272     xfs_csn_t       xc_push_seq;
0273     bool            xc_push_commit_stable;
0274     struct list_head    xc_committing;
0275     wait_queue_head_t   xc_commit_wait;
0276     wait_queue_head_t   xc_start_wait;
0277     xfs_csn_t       xc_current_sequence;
0278     wait_queue_head_t   xc_push_wait;   /* background push throttle */
0279
0280     void __percpu       *xc_pcp;    /* percpu CIL structures */
0281 #ifdef CONFIG_HOTPLUG_CPU
0282     struct list_head    xc_pcp_list;
0283 #endif
0284 } ____cacheline_aligned_in_smp;
0285
/* xc_flags bit values */
#define XLOG_CIL_EMPTY			1
#define XLOG_CIL_PCP_SPACE		2
0289
0290 /*
0291  * The amount of log space we allow the CIL to aggregate is difficult to size.
0292  * Whatever we choose, we have to make sure we can get a reservation for the
0293  * log space effectively, that it is large enough to capture sufficient
0294  * relogging to reduce log buffer IO significantly, but it is not too large for
0295  * the log or induces too much latency when writing out through the iclogs. We
0296  * track both space consumed and the number of vectors in the checkpoint
0297  * context, so we need to decide which to use for limiting.
0298  *
0299  * Every log buffer we write out during a push needs a header reserved, which
0300  * is at least one sector and more for v2 logs. Hence we need a reservation of
0301  * at least 512 bytes per 32k of log space just for the LR headers. That means
0302  * 16KB of reservation per megabyte of delayed logging space we will consume,
0303  * plus various headers.  The number of headers will vary based on the num of
0304  * io vectors, so limiting on a specific number of vectors is going to result
0305  * in transactions of varying size. IOWs, it is more consistent to track and
0306  * limit space consumed in the log rather than by the number of objects being
0307  * logged in order to prevent checkpoint ticket overruns.
0308  *
0309  * Further, use of static reservations through the log grant mechanism is
0310  * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
0311  * grant) and a significant deadlock potential because regranting write space
0312  * can block on log pushes. Hence if we have to regrant log space during a log
0313  * push, we can deadlock.
0314  *
0315  * However, we can avoid this by use of a dynamic "reservation stealing"
0316  * technique during transaction commit whereby unused reservation space in the
0317  * transaction ticket is transferred to the CIL ctx commit ticket to cover the
0318  * space needed by the checkpoint transaction. This means that we never need to
0319  * specifically reserve space for the CIL checkpoint transaction, nor do we
0320  * need to regrant space once the checkpoint completes. This also means the
0321  * checkpoint transaction ticket is specific to the checkpoint context, rather
0322  * than the CIL itself.
0323  *
0324  * With dynamic reservations, we can effectively make up arbitrary limits for
0325  * the checkpoint size so long as they don't violate any other size rules.
0326  * Recovery imposes a rule that no transaction exceed half the log, so we are
0327  * limited by that.  Furthermore, the log transaction reservation subsystem
0328  * tries to keep 25% of the log free, so we need to keep below that limit or we
0329  * risk running out of free log space to start any new transactions.
0330  *
0331  * In order to keep background CIL push efficient, we only need to ensure the
0332  * CIL is large enough to maintain sufficient in-memory relogging to avoid
0333  * repeated physical writes of frequently modified metadata. If we allow the CIL
0334  * to grow to a substantial fraction of the log, then we may be pinning hundreds
0335  * of megabytes of metadata in memory until the CIL flushes. This can cause
0336  * issues when we are running low on memory - pinned memory cannot be reclaimed,
0337  * and the CIL consumes a lot of memory. Hence we need to set an upper physical
0338  * size limit for the CIL that limits the maximum amount of memory pinned by the
0339  * CIL but does not limit performance by reducing relogging efficiency
0340  * significantly.
0341  *
0342  * As such, the CIL push threshold ends up being the smaller of two thresholds:
0343  * - a threshold large enough that it allows CIL to be pushed and progress to be
0344  *   made without excessive blocking of incoming transaction commits. This is
0345  *   defined to be 12.5% of the log space - half the 25% push threshold of the
0346  *   AIL.
0347  * - small enough that it doesn't pin excessive amounts of memory but maintains
0348  *   close to peak relogging efficiency. This is defined to be 16x the iclog
0349  *   buffer window (32MB) as measurements have shown this to be roughly the
0350  *   point of diminishing performance increases under highly concurrent
0351  *   modification workloads.
0352  *
0353  * To prevent the CIL from overflowing upper commit size bounds, we introduce a
0354  * new threshold at which we block committing transactions until the background
0355  * CIL commit commences and switches to a new context. While this is not a hard
0356  * limit, it forces the process committing a transaction to the CIL to block and
0357  * yield the CPU, giving the CIL push work a chance to be scheduled and start
0358  * work. This prevents a process running lots of transactions from overfilling
0359  * the CIL because it is not yielding the CPU. We set the blocking limit at
0360  * twice the background push space threshold so we keep in line with the AIL
0361  * push thresholds.
0362  *
0363  * Note: this is not a -hard- limit as blocking is applied after the transaction
0364  * is inserted into the CIL and the push has been triggered. It is largely a
0365  * throttling mechanism that allows the CIL push to be scheduled and run. A hard
0366  * limit will be difficult to implement without introducing global serialisation
0367  * in the CIL commit fast path, and it's not at all clear that we actually need
0368  * such hard limits given the ~7 years we've run without a hard limit before
0369  * finding the first situation where a checkpoint size overflow actually
0370  * occurred. Hence the simple throttle, and an ASSERT check to tell us that
0371  * we've overrun the max size.
0372  */
/*
 * Background push threshold: the smaller of one eighth of the log size and
 * sixteen times BBTOB(XLOG_TOTAL_REC_SHIFT(log)), compared as ints.
 * Note that the macro argument is evaluated more than once.
 */
#define XLOG_CIL_SPACE_LIMIT(log)					\
	min_t(int, (log)->l_logsize >> 3,				\
	      BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)

/* Blocking threshold: twice the background push threshold. */
#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)				\
	(XLOG_CIL_SPACE_LIMIT(log) * 2)
0378
0379 /*
0380  * ticket grant locks, queues and accounting have their own cachelines
0381  * as these are quite hot and can be operated on concurrently.
0382  */
0383 struct xlog_grant_head {
0384     spinlock_t      lock ____cacheline_aligned_in_smp;
0385     struct list_head    waiters;
0386     atomic64_t      grant;
0387 };
0388
0389 /*
0390  * The reservation head lsn is not made up of a cycle number and block number.
0391  * Instead, it uses a cycle number and byte number.  Logs don't expect to
0392  * overflow 31 bits worth of byte offset, so using a byte number will mean
0393  * that round off problems won't occur when releasing partial reservations.
0394  */
0395 struct xlog {
0396     /* The following fields don't need locking */
0397     struct xfs_mount    *l_mp;          /* mount point */
0398     struct xfs_ail      *l_ailp;    /* AIL log is working with */
0399     struct xfs_cil      *l_cilp;    /* CIL log is working with */
0400     struct xfs_buftarg  *l_targ;        /* buftarg of log */
0401     struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
0402     struct delayed_work l_work;     /* background flush work */
0403     long            l_opstate;  /* operational state */
0404     uint            l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
0405     struct list_head    *l_buf_cancel_table;
0406     int         l_iclog_hsize;  /* size of iclog header */
0407     int         l_iclog_heads;  /* # of iclog header sectors */
0408     uint            l_sectBBsize;   /* sector size in BBs (2^n) */
0409     int         l_iclog_size;   /* size of log in bytes */
0410     int         l_iclog_bufs;   /* number of iclog buffers */
0411     xfs_daddr_t     l_logBBstart;   /* start block of log */
0412     int         l_logsize;      /* size of log in bytes */
0413     int         l_logBBsize;    /* size of log in BB chunks */
0414
0415     /* The following block of fields are changed while holding icloglock */
0416     wait_queue_head_t   l_flush_wait ____cacheline_aligned_in_smp;
0417                         /* waiting for iclog flush */
0418     int         l_covered_state;/* state of "covering disk
0419                          * log entries" */
0420     xlog_in_core_t      *l_iclog;       /* head log queue   */
0421     spinlock_t      l_icloglock;    /* grab to change iclog state */
0422     int         l_curr_cycle;   /* Cycle number of log writes */
0423     int         l_prev_cycle;   /* Cycle number before last
0424                          * block increment */
0425     int         l_curr_block;   /* current logical log block */
0426     int         l_prev_block;   /* previous logical log block */
0427
0428     /*
0429      * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
0430      * read without needing to hold specific locks. To avoid operations
0431      * contending with other hot objects, place each of them on a separate
0432      * cacheline.
0433      */
0434     /* lsn of last LR on disk */
0435     atomic64_t      l_last_sync_lsn ____cacheline_aligned_in_smp;
0436     /* lsn of 1st LR with unflushed * buffers */
0437     atomic64_t      l_tail_lsn ____cacheline_aligned_in_smp;
0438
0439     struct xlog_grant_head  l_reserve_head;
0440     struct xlog_grant_head  l_write_head;
0441
0442     struct xfs_kobj     l_kobj;
0443
0444     /* log recovery lsn tracking (for buffer submission */
0445     xfs_lsn_t       l_recovery_lsn;
0446
0447     uint32_t        l_iclog_roundoff;/* padding roundoff */
0448
0449     /* Users of log incompat features should take a read lock. */
0450     struct rw_semaphore l_incompat_users;
0451 };
0452
0453 /*
0454  * Bits for operational state
0455  */
/* in the middle of recovery */
#define XLOG_ACTIVE_RECOVERY		0
/* log was recovered */
#define XLOG_RECOVERY_NEEDED		1
/* log hit an I/O error, and being shutdown */
#define XLOG_IO_ERROR			2
/* log tail verify warning issued */
#define XLOG_TAIL_WARN			3
0461
0462 static inline bool
0463 xlog_recovery_needed(struct xlog *log)
0464 {
0465     return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
0466 }
0467
0468 static inline bool
0469 xlog_in_recovery(struct xlog *log)
0470 {
0471     return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
0472 }
0473
0474 static inline bool
0475 xlog_is_shutdown(struct xlog *log)
0476 {
0477     return test_bit(XLOG_IO_ERROR, &log->l_opstate);
0478 }
0479
0480 /*
0481  * Wait until the xlog_force_shutdown() has marked the log as shut down
0482  * so xlog_is_shutdown() will always return true.
0483  */
0484 static inline void
0485 xlog_shutdown_wait(
0486     struct xlog *log)
0487 {
0488     wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
0489 }
0490
0491 /* common routines */
0492 extern int
0493 xlog_recover(
0494     struct xlog     *log);
0495 extern int
0496 xlog_recover_finish(
0497     struct xlog     *log);
0498 extern void
0499 xlog_recover_cancel(struct xlog *);
0500
0501 extern __le32    xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
0502                 char *dp, int size);
0503
0504 extern struct kmem_cache *xfs_log_ticket_cache;
0505 struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
0506         int count, bool permanent);
0507
0508 void    xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
0509 void    xlog_print_trans(struct xfs_trans *);
0510 int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
0511         struct list_head *lv_chain, struct xlog_ticket *tic,
0512         uint32_t len);
0513 void    xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
0514 void    xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
0515
0516 void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
0517         int eventual_size);
0518 int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
0519         struct xlog_ticket *ticket);
0520
0521 /*
0522  * When we crack an atomic LSN, we sample it first so that the value will not
0523  * change while we are cracking it into the component values. This means we
0524  * will always get consistent component values to work from. This should always
0525  * be used to sample and crack LSNs that are stored and updated in atomic
0526  * variables.
0527  */
0528 static inline void
0529 xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
0530 {
0531     xfs_lsn_t val = atomic64_read(lsn);
0532
0533     *cycle = CYCLE_LSN(val);
0534     *block = BLOCK_LSN(val);
0535 }
0536
0537 /*
0538  * Calculate and assign a value to an atomic LSN variable from component pieces.
0539  */
0540 static inline void
0541 xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
0542 {
0543     atomic64_set(lsn, xlog_assign_lsn(cycle, block));
0544 }
0545
0546 /*
0547  * When we crack the grant head, we sample it first so that the value will not
0548  * change while we are cracking it into the component values. This means we
0549  * will always get consistent component values to work from.
0550  */
static inline void
xlog_crack_grant_head_val(
	int64_t		val,
	int		*cycle,
	int		*space)
{
	/* upper 32 bits carry the cycle, lower 32 bits carry the space */
	*cycle = (int)(val >> 32);
	*space = (int)(val & 0xffffffff);
}
0557
0558 static inline void
0559 xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
0560 {
0561     xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
0562 }
0563
/*
 * Pack a grant head cycle and space count into a single 64-bit value: the
 * cycle occupies the upper 32 bits and the space the lower 32 bits, which is
 * the layout xlog_crack_grant_head_val() takes apart.
 *
 * Both inputs are narrowed to unsigned 32-bit quantities before being
 * combined. ORing in a negative int space would sign-extend it and overwrite
 * the cycle half, and left-shifting a negative signed cycle is undefined
 * behaviour. For non-negative inputs the result is unchanged.
 */
static inline int64_t
xlog_assign_grant_head_val(int cycle, int space)
{
	uint64_t	hi = (uint64_t)(uint32_t)cycle << 32;
	uint64_t	lo = (uint32_t)space;

	return (int64_t)(hi | lo);
}
0569
0570 static inline void
0571 xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
0572 {
0573     atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
0574 }
0575
0576 /*
0577  * Committed Item List interfaces
0578  */
0579 int xlog_cil_init(struct xlog *log);
0580 void    xlog_cil_init_post_recovery(struct xlog *log);
0581 void    xlog_cil_destroy(struct xlog *log);
0582 bool    xlog_cil_empty(struct xlog *log);
0583 void    xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
0584             xfs_csn_t *commit_seq, bool regrant);
0585 void    xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx,
0586             struct xlog_in_core *iclog);
0587
0588
0589 /*
0590  * CIL force routines
0591  */
0592 void xlog_cil_flush(struct xlog *log);
0593 xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
0594
0595 static inline void
0596 xlog_cil_force(struct xlog *log)
0597 {
0598     xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
0599 }
0600
0601 /*
0602  * Wrapper function for waiting on a wait queue serialised against wakeups
0603  * by a spinlock. This matches the semantics of all the wait queues used in the
0604  * log code.
0605  */
0606 static inline void
0607 xlog_wait(
0608     struct wait_queue_head  *wq,
0609     struct spinlock     *lock)
0610         __releases(lock)
0611 {
0612     DECLARE_WAITQUEUE(wait, current);
0613
0614     add_wait_queue_exclusive(wq, &wait);
0615     __set_current_state(TASK_UNINTERRUPTIBLE);
0616     spin_unlock(lock);
0617     schedule();
0618     remove_wait_queue(wq, &wait);
0619 }
0620
int
xlog_wait_on_iclog(
	struct xlog_in_core	*iclog);
0622
0623 /*
0624  * The LSN is valid so long as it is behind the current LSN. If it isn't, this
0625  * means that the next log record that includes this metadata could have a
0626  * smaller LSN. In turn, this means that the modification in the log would not
0627  * replay.
0628  */
0629 static inline bool
0630 xlog_valid_lsn(
0631     struct xlog *log,
0632     xfs_lsn_t   lsn)
0633 {
0634     int     cur_cycle;
0635     int     cur_block;
0636     bool        valid = true;
0637
0638     /*
0639      * First, sample the current lsn without locking to avoid added
0640      * contention from metadata I/O. The current cycle and block are updated
0641      * (in xlog_state_switch_iclogs()) and read here in a particular order
0642      * to avoid false negatives (e.g., thinking the metadata LSN is valid
0643      * when it is not).
0644      *
0645      * The current block is always rewound before the cycle is bumped in
0646      * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
0647      * a transiently forward state. Instead, we can see the LSN in a
0648      * transiently behind state if we happen to race with a cycle wrap.
0649      */
0650     cur_cycle = READ_ONCE(log->l_curr_cycle);
0651     smp_rmb();
0652     cur_block = READ_ONCE(log->l_curr_block);
0653
0654     if ((CYCLE_LSN(lsn) > cur_cycle) ||
0655         (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
0656         /*
0657          * If the metadata LSN appears invalid, it's possible the check
0658          * above raced with a wrap to the next log cycle. Grab the lock
0659          * to check for sure.
0660          */
0661         spin_lock(&log->l_icloglock);
0662         cur_cycle = log->l_curr_cycle;
0663         cur_block = log->l_curr_block;
0664         spin_unlock(&log->l_icloglock);
0665
0666         if ((CYCLE_LSN(lsn) > cur_cycle) ||
0667             (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
0668             valid = false;
0669     }
0670
0671     return valid;
0672 }
0673
0674 /*
0675  * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
0676  * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
0677  * to fall back to vmalloc, so we can't actually do anything useful with gfp
0678  * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
0679  * will do direct reclaim and compaction in the slow path, both of which are
0680  * horrendously expensive. We just want kmalloc to fail fast and fall back to
0681  * vmalloc if it can't get something straight away from the free lists or
0682  * buddy allocator. Hence we have to open code kvmalloc ourselves here.
0683  *
0684  * This assumes that the caller uses memalloc_nofs_save task context here, so
0685  * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
0686  * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
0687  * allocations, so lets just all pretend this is a GFP_KERNEL context
0688  * operation....
0689  */
0690 static inline void *
0691 xlog_kvmalloc(
0692     size_t      buf_size)
0693 {
0694     gfp_t       flags = GFP_KERNEL;
0695     void        *p;
0696
0697     flags &= ~__GFP_DIRECT_RECLAIM;
0698     flags |= __GFP_NOWARN | __GFP_NORETRY;
0699     do {
0700         p = kmalloc(buf_size, flags);
0701         if (!p)
0702             p = vmalloc(buf_size);
0703     } while (!p);
0704
0705     return p;
0706 }
0707
0708 /*
0709  * CIL CPU dead notifier
0710  */
void
xlog_cil_pcp_dead(
	struct xlog		*log,
	unsigned int		cpu);
0712
0713 #endif  /* __XFS_LOG_PRIV_H__ */