Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0+
0002 /*
0003  * Copyright (C) 2016 Oracle.  All Rights Reserved.
0004  * Author: Darrick J. Wong <darrick.wong@oracle.com>
0005  */
0006 #include "xfs.h"
0007 #include "xfs_fs.h"
0008 #include "xfs_shared.h"
0009 #include "xfs_format.h"
0010 #include "xfs_log_format.h"
0011 #include "xfs_trans_resv.h"
0012 #include "xfs_mount.h"
0013 #include "xfs_alloc.h"
0014 #include "xfs_errortag.h"
0015 #include "xfs_error.h"
0016 #include "xfs_trace.h"
0017 #include "xfs_trans.h"
0018 #include "xfs_rmap_btree.h"
0019 #include "xfs_btree.h"
0020 #include "xfs_refcount_btree.h"
0021 #include "xfs_ialloc_btree.h"
0022 #include "xfs_ag.h"
0023 #include "xfs_ag_resv.h"
0024 
0025 /*
0026  * Per-AG Block Reservations
0027  *
0028  * For some kinds of allocation group metadata structures, it is advantageous
0029  * to reserve a small number of blocks in each AG so that future expansions of
0030  * that data structure do not encounter ENOSPC because errors during a btree
0031  * split cause the filesystem to go offline.
0032  *
0033  * Prior to the introduction of reflink, this wasn't an issue because the free
0034  * space btrees maintain a reserve of space (the AGFL) to handle any expansion
0035  * that may be necessary; and allocations of other metadata (inodes, BMBT,
0036  * dir/attr) aren't restricted to a single AG.  However, with reflink it is
0037  * possible to allocate all the space in an AG, have subsequent reflink/CoW
0038  * activity expand the refcount btree, and discover that there's no space left
0039  * to handle that expansion.  Since we can calculate the maximum size of the
0040  * refcount btree, we can reserve space for it and avoid ENOSPC.
0041  *
0042  * Handling per-AG reservations consists of four changes to the allocator's
0043  * behavior:  First, because these reservations are always needed, we decrease
0044  * the ag_max_usable counter to reflect the size of the AG after the reserved
0045  * blocks are taken.  Second, the reservations must be reflected in the
0046  * fdblocks count to maintain proper accounting.  Third, each AG must maintain
0047  * its own reserved block counter so that we can calculate the amount of space
0048  * that must remain free to maintain the reservations.  Fourth, the "remaining
0049  * reserved blocks" count must be used when calculating the length of the
0050  * longest free extent in an AG and to clamp maxlen in the per-AG allocation
0051  * functions.  In other words, we maintain a virtual allocation via in-core
0052  * accounting tricks so that we don't have to clean up after a crash. :)
0053  *
0054  * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
0055  * values via struct xfs_alloc_arg or directly to the xfs_free_extent
0056  * function.  It might seem a little funny to maintain a reservoir of blocks
0057  * to feed another reservoir, but the AGFL only holds enough blocks to get
0058  * through the next transaction.  The per-AG reservation is to ensure (we
0059  * hope) that each AG never runs out of blocks.  Each data structure wanting
0060  * to use the reservation system should update ask/used in xfs_ag_resv_init.
0061  */
0062 
0063 /*
0064  * Are we critically low on blocks?  For now we'll define that as the number
0065  * of blocks we can get our hands on being less than 10% of what we reserved
0066  * or less than some arbitrary number (maximum btree height).
0067  */
0068 bool
0069 xfs_ag_resv_critical(
0070     struct xfs_perag        *pag,
0071     enum xfs_ag_resv_type       type)
0072 {
0073     xfs_extlen_t            avail;
0074     xfs_extlen_t            orig;
0075 
0076     switch (type) {
0077     case XFS_AG_RESV_METADATA:
0078         avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
0079         orig = pag->pag_meta_resv.ar_asked;
0080         break;
0081     case XFS_AG_RESV_RMAPBT:
0082         avail = pag->pagf_freeblks + pag->pagf_flcount -
0083             pag->pag_meta_resv.ar_reserved;
0084         orig = pag->pag_rmapbt_resv.ar_asked;
0085         break;
0086     default:
0087         ASSERT(0);
0088         return false;
0089     }
0090 
0091     trace_xfs_ag_resv_critical(pag, type, avail);
0092 
0093     /* Critically low if less than 10% or max btree height remains. */
0094     return XFS_TEST_ERROR(avail < orig / 10 ||
0095                   avail < pag->pag_mount->m_agbtree_maxlevels,
0096             pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
0097 }
0098 
0099 /*
0100  * How many blocks are reserved but not used, and therefore must not be
0101  * allocated away?
0102  */
0103 xfs_extlen_t
0104 xfs_ag_resv_needed(
0105     struct xfs_perag        *pag,
0106     enum xfs_ag_resv_type       type)
0107 {
0108     xfs_extlen_t            len;
0109 
0110     len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
0111     switch (type) {
0112     case XFS_AG_RESV_METADATA:
0113     case XFS_AG_RESV_RMAPBT:
0114         len -= xfs_perag_resv(pag, type)->ar_reserved;
0115         break;
0116     case XFS_AG_RESV_NONE:
0117         /* empty */
0118         break;
0119     default:
0120         ASSERT(0);
0121     }
0122 
0123     trace_xfs_ag_resv_needed(pag, type, len);
0124 
0125     return len;
0126 }
0127 
0128 /* Clean out a reservation */
0129 static int
0130 __xfs_ag_resv_free(
0131     struct xfs_perag        *pag,
0132     enum xfs_ag_resv_type       type)
0133 {
0134     struct xfs_ag_resv      *resv;
0135     xfs_extlen_t            oldresv;
0136     int             error;
0137 
0138     trace_xfs_ag_resv_free(pag, type, 0);
0139 
0140     resv = xfs_perag_resv(pag, type);
0141     if (pag->pag_agno == 0)
0142         pag->pag_mount->m_ag_max_usable += resv->ar_asked;
0143     /*
0144      * RMAPBT blocks come from the AGFL and AGFL blocks are always
0145      * considered "free", so whatever was reserved at mount time must be
0146      * given back at umount.
0147      */
0148     if (type == XFS_AG_RESV_RMAPBT)
0149         oldresv = resv->ar_orig_reserved;
0150     else
0151         oldresv = resv->ar_reserved;
0152     error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
0153     resv->ar_reserved = 0;
0154     resv->ar_asked = 0;
0155     resv->ar_orig_reserved = 0;
0156 
0157     if (error)
0158         trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
0159                 error, _RET_IP_);
0160     return error;
0161 }
0162 
0163 /* Free a per-AG reservation. */
0164 int
0165 xfs_ag_resv_free(
0166     struct xfs_perag        *pag)
0167 {
0168     int             error;
0169     int             err2;
0170 
0171     error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
0172     err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
0173     if (err2 && !error)
0174         error = err2;
0175     return error;
0176 }
0177 
0178 static int
0179 __xfs_ag_resv_init(
0180     struct xfs_perag        *pag,
0181     enum xfs_ag_resv_type       type,
0182     xfs_extlen_t            ask,
0183     xfs_extlen_t            used)
0184 {
0185     struct xfs_mount        *mp = pag->pag_mount;
0186     struct xfs_ag_resv      *resv;
0187     int             error;
0188     xfs_extlen_t            hidden_space;
0189 
0190     if (used > ask)
0191         ask = used;
0192 
0193     switch (type) {
0194     case XFS_AG_RESV_RMAPBT:
0195         /*
0196          * Space taken by the rmapbt is not subtracted from fdblocks
0197          * because the rmapbt lives in the free space.  Here we must
0198          * subtract the entire reservation from fdblocks so that we
0199          * always have blocks available for rmapbt expansion.
0200          */
0201         hidden_space = ask;
0202         break;
0203     case XFS_AG_RESV_METADATA:
0204         /*
0205          * Space taken by all other metadata btrees are accounted
0206          * on-disk as used space.  We therefore only hide the space
0207          * that is reserved but not used by the trees.
0208          */
0209         hidden_space = ask - used;
0210         break;
0211     default:
0212         ASSERT(0);
0213         return -EINVAL;
0214     }
0215 
0216     if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
0217         error = -ENOSPC;
0218     else
0219         error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
0220     if (error) {
0221         trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
0222                 error, _RET_IP_);
0223         xfs_warn(mp,
0224 "Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
0225                 pag->pag_agno);
0226         return error;
0227     }
0228 
0229     /*
0230      * Reduce the maximum per-AG allocation length by however much we're
0231      * trying to reserve for an AG.  Since this is a filesystem-wide
0232      * counter, we only make the adjustment for AG 0.  This assumes that
0233      * there aren't any AGs hungrier for per-AG reservation than AG 0.
0234      */
0235     if (pag->pag_agno == 0)
0236         mp->m_ag_max_usable -= ask;
0237 
0238     resv = xfs_perag_resv(pag, type);
0239     resv->ar_asked = ask;
0240     resv->ar_orig_reserved = hidden_space;
0241     resv->ar_reserved = ask - used;
0242 
0243     trace_xfs_ag_resv_init(pag, type, ask);
0244     return 0;
0245 }
0246 
0247 /* Create a per-AG block reservation. */
0248 int
0249 xfs_ag_resv_init(
0250     struct xfs_perag        *pag,
0251     struct xfs_trans        *tp)
0252 {
0253     struct xfs_mount        *mp = pag->pag_mount;
0254     xfs_extlen_t            ask;
0255     xfs_extlen_t            used;
0256     int             error = 0, error2;
0257     bool                has_resv = false;
0258 
0259     /* Create the metadata reservation. */
0260     if (pag->pag_meta_resv.ar_asked == 0) {
0261         ask = used = 0;
0262 
0263         error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
0264         if (error)
0265             goto out;
0266 
0267         error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
0268         if (error)
0269             goto out;
0270 
0271         error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
0272                 ask, used);
0273         if (error) {
0274             /*
0275              * Because we didn't have per-AG reservations when the
0276              * finobt feature was added we might not be able to
0277              * reserve all needed blocks.  Warn and fall back to the
0278              * old and potentially buggy code in that case, but
0279              * ensure we do have the reservation for the refcountbt.
0280              */
0281             ask = used = 0;
0282 
0283             mp->m_finobt_nores = true;
0284 
0285             error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
0286                     &used);
0287             if (error)
0288                 goto out;
0289 
0290             error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
0291                     ask, used);
0292             if (error)
0293                 goto out;
0294         }
0295         if (ask)
0296             has_resv = true;
0297     }
0298 
0299     /* Create the RMAPBT metadata reservation */
0300     if (pag->pag_rmapbt_resv.ar_asked == 0) {
0301         ask = used = 0;
0302 
0303         error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
0304         if (error)
0305             goto out;
0306 
0307         error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
0308         if (error)
0309             goto out;
0310         if (ask)
0311             has_resv = true;
0312     }
0313 
0314 out:
0315     /*
0316      * Initialize the pagf if we have at least one active reservation on the
0317      * AG. This may have occurred already via reservation calculation, but
0318      * fall back to an explicit init to ensure the in-core allocbt usage
0319      * counters are initialized as soon as possible. This is important
0320      * because filesystems with large perag reservations are susceptible to
0321      * free space reservation problems that the allocbt counter is used to
0322      * address.
0323      */
0324     if (has_resv) {
0325         error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
0326         if (error2)
0327             return error2;
0328 
0329         /*
0330          * If there isn't enough space in the AG to satisfy the
0331          * reservation, let the caller know that there wasn't enough
0332          * space.  Callers are responsible for deciding what to do
0333          * next, since (in theory) we can stumble along with
0334          * insufficient reservation if data blocks are being freed to
0335          * replenish the AG's free space.
0336          */
0337         if (!error &&
0338             xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
0339             xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
0340             pag->pagf_freeblks + pag->pagf_flcount)
0341             error = -ENOSPC;
0342     }
0343 
0344     return error;
0345 }
0346 
0347 /* Allocate a block from the reservation. */
0348 void
0349 xfs_ag_resv_alloc_extent(
0350     struct xfs_perag        *pag,
0351     enum xfs_ag_resv_type       type,
0352     struct xfs_alloc_arg        *args)
0353 {
0354     struct xfs_ag_resv      *resv;
0355     xfs_extlen_t            len;
0356     uint                field;
0357 
0358     trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
0359 
0360     switch (type) {
0361     case XFS_AG_RESV_AGFL:
0362         return;
0363     case XFS_AG_RESV_METADATA:
0364     case XFS_AG_RESV_RMAPBT:
0365         resv = xfs_perag_resv(pag, type);
0366         break;
0367     default:
0368         ASSERT(0);
0369         fallthrough;
0370     case XFS_AG_RESV_NONE:
0371         field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
0372                        XFS_TRANS_SB_FDBLOCKS;
0373         xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
0374         return;
0375     }
0376 
0377     len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
0378     resv->ar_reserved -= len;
0379     if (type == XFS_AG_RESV_RMAPBT)
0380         return;
0381     /* Allocations of reserved blocks only need on-disk sb updates... */
0382     xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
0383     /* ...but non-reserved blocks need in-core and on-disk updates. */
0384     if (args->len > len)
0385         xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
0386                 -((int64_t)args->len - len));
0387 }
0388 
0389 /* Free a block to the reservation. */
0390 void
0391 xfs_ag_resv_free_extent(
0392     struct xfs_perag        *pag,
0393     enum xfs_ag_resv_type       type,
0394     struct xfs_trans        *tp,
0395     xfs_extlen_t            len)
0396 {
0397     xfs_extlen_t            leftover;
0398     struct xfs_ag_resv      *resv;
0399 
0400     trace_xfs_ag_resv_free_extent(pag, type, len);
0401 
0402     switch (type) {
0403     case XFS_AG_RESV_AGFL:
0404         return;
0405     case XFS_AG_RESV_METADATA:
0406     case XFS_AG_RESV_RMAPBT:
0407         resv = xfs_perag_resv(pag, type);
0408         break;
0409     default:
0410         ASSERT(0);
0411         fallthrough;
0412     case XFS_AG_RESV_NONE:
0413         xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
0414         return;
0415     }
0416 
0417     leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
0418     resv->ar_reserved += leftover;
0419     if (type == XFS_AG_RESV_RMAPBT)
0420         return;
0421     /* Freeing into the reserved pool only requires on-disk update... */
0422     xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
0423     /* ...but freeing beyond that requires in-core and on-disk update. */
0424     if (len > leftover)
0425         xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
0426 }