Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
0004  * All Rights Reserved.
0005  */
0006 #include "xfs.h"
0007 #include "xfs_fs.h"
0008 #include "xfs_format.h"
0009 #include "xfs_log_format.h"
0010 #include "xfs_shared.h"
0011 #include "xfs_trans_resv.h"
0012 #include "xfs_bit.h"
0013 #include "xfs_mount.h"
0014 #include "xfs_defer.h"
0015 #include "xfs_btree.h"
0016 #include "xfs_rmap.h"
0017 #include "xfs_alloc_btree.h"
0018 #include "xfs_alloc.h"
0019 #include "xfs_extent_busy.h"
0020 #include "xfs_errortag.h"
0021 #include "xfs_error.h"
0022 #include "xfs_trace.h"
0023 #include "xfs_trans.h"
0024 #include "xfs_buf_item.h"
0025 #include "xfs_log.h"
0026 #include "xfs_ag.h"
0027 #include "xfs_ag_resv.h"
0028 #include "xfs_bmap.h"
0029 
0030 struct kmem_cache   *xfs_extfree_item_cache;
0031 
0032 struct workqueue_struct *xfs_alloc_wq;
0033 
0034 #define XFS_ABSDIFF(a,b)    (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
0035 
0036 #define XFSA_FIXUP_BNO_OK   1
0037 #define XFSA_FIXUP_CNT_OK   2
0038 
0039 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
0040 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
0041 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
0042 
0043 /*
0044  * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
0045  * the beginning of the block for a proper header with the location information
0046  * and CRC.
0047  */
0048 unsigned int
0049 xfs_agfl_size(
0050     struct xfs_mount    *mp)
0051 {
0052     unsigned int        size = mp->m_sb.sb_sectsize;
0053 
0054     if (xfs_has_crc(mp))
0055         size -= sizeof(struct xfs_agfl);
0056 
0057     return size / sizeof(xfs_agblock_t);
0058 }
0059 
0060 unsigned int
0061 xfs_refc_block(
0062     struct xfs_mount    *mp)
0063 {
0064     if (xfs_has_rmapbt(mp))
0065         return XFS_RMAP_BLOCK(mp) + 1;
0066     if (xfs_has_finobt(mp))
0067         return XFS_FIBT_BLOCK(mp) + 1;
0068     return XFS_IBT_BLOCK(mp) + 1;
0069 }
0070 
0071 xfs_extlen_t
0072 xfs_prealloc_blocks(
0073     struct xfs_mount    *mp)
0074 {
0075     if (xfs_has_reflink(mp))
0076         return xfs_refc_block(mp) + 1;
0077     if (xfs_has_rmapbt(mp))
0078         return XFS_RMAP_BLOCK(mp) + 1;
0079     if (xfs_has_finobt(mp))
0080         return XFS_FIBT_BLOCK(mp) + 1;
0081     return XFS_IBT_BLOCK(mp) + 1;
0082 }
0083 
0084 /*
0085  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
0086  * guarantee that we can refill the AGFL prior to allocating space in a nearly
0087  * full AG.  Although the space described by the free space btrees, the
0088  * blocks used by the freesp btrees themselves, and the blocks owned by the
0089  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
0090  * free space in the AG drop so low that the free space btrees cannot refill an
0091  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
0092  * until the fs goes down, we subtract this many AG blocks from the incore
0093  * fdblocks to ensure user allocation does not overcommit the space the
0094  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
0095  * withhold space from xfs_mod_fdblocks, so we do not account for that here.
0096  */
0097 #define XFS_ALLOCBT_AGFL_RESERVE    4
0098 
0099 /*
0100  * Compute the number of blocks that we set aside to guarantee the ability to
0101  * refill the AGFL and handle a full bmap btree split.
0102  *
0103  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
0104  * AGF buffer (PV 947395), we place constraints on the relationship among
0105  * actual allocations for data blocks, freelist blocks, and potential file data
0106  * bmap btree blocks. However, these restrictions may result in no actual space
0107  * allocated for a delayed extent, for example, a data block in a certain AG is
0108  * allocated but there is no additional block for the additional bmap btree
0109  * block due to a split of the bmap btree of the file. The result of this may
0110  * lead to an infinite loop when the file gets flushed to disk and all delayed
0111  * extents need to be actually allocated. To get around this, we explicitly set
0112  * aside a few blocks which will not be reserved in delayed allocation.
0113  *
0114  * For each AG, we need to reserve enough blocks to replenish a totally empty
0115  * AGFL and 4 more to handle a potential split of the file's bmap btree.
0116  */
0117 unsigned int
0118 xfs_alloc_set_aside(
0119     struct xfs_mount    *mp)
0120 {
0121     return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
0122 }
0123 
0124 /*
0125  * When deciding how much space to allocate out of an AG, we limit the
0126  * allocation maximum size to the size the AG. However, we cannot use all the
0127  * blocks in the AG - some are permanently used by metadata. These
0128  * blocks are generally:
0129  *  - the AG superblock, AGF, AGI and AGFL
0130  *  - the AGF (bno and cnt) and AGI btree root blocks, and optionally
0131  *    the AGI free inode and rmap btree root blocks.
0132  *  - blocks on the AGFL according to xfs_alloc_set_aside() limits
0133  *  - the rmapbt root block
0134  *
0135  * The AG headers are sector sized, so the amount of space they take up is
0136  * dependent on filesystem geometry. The others are all single blocks.
0137  */
0138 unsigned int
0139 xfs_alloc_ag_max_usable(
0140     struct xfs_mount    *mp)
0141 {
0142     unsigned int        blocks;
0143 
0144     blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
0145     blocks += XFS_ALLOCBT_AGFL_RESERVE;
0146     blocks += 3;            /* AGF, AGI btree root blocks */
0147     if (xfs_has_finobt(mp))
0148         blocks++;       /* finobt root block */
0149     if (xfs_has_rmapbt(mp))
0150         blocks++;       /* rmap root block */
0151     if (xfs_has_reflink(mp))
0152         blocks++;       /* refcount root block */
0153 
0154     return mp->m_sb.sb_agblocks - blocks;
0155 }
0156 
0157 /*
0158  * Lookup the record equal to [bno, len] in the btree given by cur.
0159  */
0160 STATIC int              /* error */
0161 xfs_alloc_lookup_eq(
0162     struct xfs_btree_cur    *cur,   /* btree cursor */
0163     xfs_agblock_t       bno,    /* starting block of extent */
0164     xfs_extlen_t        len,    /* length of extent */
0165     int         *stat)  /* success/failure */
0166 {
0167     int         error;
0168 
0169     cur->bc_rec.a.ar_startblock = bno;
0170     cur->bc_rec.a.ar_blockcount = len;
0171     error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
0172     cur->bc_ag.abt.active = (*stat == 1);
0173     return error;
0174 }
0175 
0176 /*
0177  * Lookup the first record greater than or equal to [bno, len]
0178  * in the btree given by cur.
0179  */
0180 int             /* error */
0181 xfs_alloc_lookup_ge(
0182     struct xfs_btree_cur    *cur,   /* btree cursor */
0183     xfs_agblock_t       bno,    /* starting block of extent */
0184     xfs_extlen_t        len,    /* length of extent */
0185     int         *stat)  /* success/failure */
0186 {
0187     int         error;
0188 
0189     cur->bc_rec.a.ar_startblock = bno;
0190     cur->bc_rec.a.ar_blockcount = len;
0191     error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
0192     cur->bc_ag.abt.active = (*stat == 1);
0193     return error;
0194 }
0195 
0196 /*
0197  * Lookup the first record less than or equal to [bno, len]
0198  * in the btree given by cur.
0199  */
0200 int                 /* error */
0201 xfs_alloc_lookup_le(
0202     struct xfs_btree_cur    *cur,   /* btree cursor */
0203     xfs_agblock_t       bno,    /* starting block of extent */
0204     xfs_extlen_t        len,    /* length of extent */
0205     int         *stat)  /* success/failure */
0206 {
0207     int         error;
0208     cur->bc_rec.a.ar_startblock = bno;
0209     cur->bc_rec.a.ar_blockcount = len;
0210     error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
0211     cur->bc_ag.abt.active = (*stat == 1);
0212     return error;
0213 }
0214 
0215 static inline bool
0216 xfs_alloc_cur_active(
0217     struct xfs_btree_cur    *cur)
0218 {
0219     return cur && cur->bc_ag.abt.active;
0220 }
0221 
0222 /*
0223  * Update the record referred to by cur to the value given
0224  * by [bno, len].
0225  * This either works (return 0) or gets an EFSCORRUPTED error.
0226  */
0227 STATIC int              /* error */
0228 xfs_alloc_update(
0229     struct xfs_btree_cur    *cur,   /* btree cursor */
0230     xfs_agblock_t       bno,    /* starting block of extent */
0231     xfs_extlen_t        len)    /* length of extent */
0232 {
0233     union xfs_btree_rec rec;
0234 
0235     rec.alloc.ar_startblock = cpu_to_be32(bno);
0236     rec.alloc.ar_blockcount = cpu_to_be32(len);
0237     return xfs_btree_update(cur, &rec);
0238 }
0239 
0240 /*
0241  * Get the data from the pointed-to record.
0242  */
0243 int                 /* error */
0244 xfs_alloc_get_rec(
0245     struct xfs_btree_cur    *cur,   /* btree cursor */
0246     xfs_agblock_t       *bno,   /* output: starting block of extent */
0247     xfs_extlen_t        *len,   /* output: length of extent */
0248     int         *stat)  /* output: success/failure */
0249 {
0250     struct xfs_mount    *mp = cur->bc_mp;
0251     struct xfs_perag    *pag = cur->bc_ag.pag;
0252     union xfs_btree_rec *rec;
0253     int         error;
0254 
0255     error = xfs_btree_get_rec(cur, &rec, stat);
0256     if (error || !(*stat))
0257         return error;
0258 
0259     *bno = be32_to_cpu(rec->alloc.ar_startblock);
0260     *len = be32_to_cpu(rec->alloc.ar_blockcount);
0261 
0262     if (*len == 0)
0263         goto out_bad_rec;
0264 
0265     /* check for valid extent range, including overflow */
0266     if (!xfs_verify_agbno(pag, *bno))
0267         goto out_bad_rec;
0268     if (*bno > *bno + *len)
0269         goto out_bad_rec;
0270     if (!xfs_verify_agbno(pag, *bno + *len - 1))
0271         goto out_bad_rec;
0272 
0273     return 0;
0274 
0275 out_bad_rec:
0276     xfs_warn(mp,
0277         "%s Freespace BTree record corruption in AG %d detected!",
0278         cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
0279         pag->pag_agno);
0280     xfs_warn(mp,
0281         "start block 0x%x block count 0x%x", *bno, *len);
0282     return -EFSCORRUPTED;
0283 }
0284 
0285 /*
0286  * Compute aligned version of the found extent.
0287  * Takes alignment and min length into account.
0288  */
0289 STATIC bool
0290 xfs_alloc_compute_aligned(
0291     xfs_alloc_arg_t *args,      /* allocation argument structure */
0292     xfs_agblock_t   foundbno,   /* starting block in found extent */
0293     xfs_extlen_t    foundlen,   /* length in found extent */
0294     xfs_agblock_t   *resbno,    /* result block number */
0295     xfs_extlen_t    *reslen,    /* result length */
0296     unsigned    *busy_gen)
0297 {
0298     xfs_agblock_t   bno = foundbno;
0299     xfs_extlen_t    len = foundlen;
0300     xfs_extlen_t    diff;
0301     bool        busy;
0302 
0303     /* Trim busy sections out of found extent */
0304     busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
0305 
0306     /*
0307      * If we have a largish extent that happens to start before min_agbno,
0308      * see if we can shift it into range...
0309      */
0310     if (bno < args->min_agbno && bno + len > args->min_agbno) {
0311         diff = args->min_agbno - bno;
0312         if (len > diff) {
0313             bno += diff;
0314             len -= diff;
0315         }
0316     }
0317 
0318     if (args->alignment > 1 && len >= args->minlen) {
0319         xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
0320 
0321         diff = aligned_bno - bno;
0322 
0323         *resbno = aligned_bno;
0324         *reslen = diff >= len ? 0 : len - diff;
0325     } else {
0326         *resbno = bno;
0327         *reslen = len;
0328     }
0329 
0330     return busy;
0331 }
0332 
0333 /*
0334  * Compute best start block and diff for "near" allocations.
0335  * freelen >= wantlen already checked by caller.
0336  */
0337 STATIC xfs_extlen_t         /* difference value (absolute) */
0338 xfs_alloc_compute_diff(
0339     xfs_agblock_t   wantbno,    /* target starting block */
0340     xfs_extlen_t    wantlen,    /* target length */
0341     xfs_extlen_t    alignment,  /* target alignment */
0342     int     datatype,   /* are we allocating data? */
0343     xfs_agblock_t   freebno,    /* freespace's starting block */
0344     xfs_extlen_t    freelen,    /* freespace's length */
0345     xfs_agblock_t   *newbnop)   /* result: best start block from free */
0346 {
0347     xfs_agblock_t   freeend;    /* end of freespace extent */
0348     xfs_agblock_t   newbno1;    /* return block number */
0349     xfs_agblock_t   newbno2;    /* other new block number */
0350     xfs_extlen_t    newlen1=0;  /* length with newbno1 */
0351     xfs_extlen_t    newlen2=0;  /* length with newbno2 */
0352     xfs_agblock_t   wantend;    /* end of target extent */
0353     bool        userdata = datatype & XFS_ALLOC_USERDATA;
0354 
0355     ASSERT(freelen >= wantlen);
0356     freeend = freebno + freelen;
0357     wantend = wantbno + wantlen;
0358     /*
0359      * We want to allocate from the start of a free extent if it is past
0360      * the desired block or if we are allocating user data and the free
0361      * extent is before desired block. The second case is there to allow
0362      * for contiguous allocation from the remaining free space if the file
0363      * grows in the short term.
0364      */
0365     if (freebno >= wantbno || (userdata && freeend < wantend)) {
0366         if ((newbno1 = roundup(freebno, alignment)) >= freeend)
0367             newbno1 = NULLAGBLOCK;
0368     } else if (freeend >= wantend && alignment > 1) {
0369         newbno1 = roundup(wantbno, alignment);
0370         newbno2 = newbno1 - alignment;
0371         if (newbno1 >= freeend)
0372             newbno1 = NULLAGBLOCK;
0373         else
0374             newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
0375         if (newbno2 < freebno)
0376             newbno2 = NULLAGBLOCK;
0377         else
0378             newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
0379         if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
0380             if (newlen1 < newlen2 ||
0381                 (newlen1 == newlen2 &&
0382                  XFS_ABSDIFF(newbno1, wantbno) >
0383                  XFS_ABSDIFF(newbno2, wantbno)))
0384                 newbno1 = newbno2;
0385         } else if (newbno2 != NULLAGBLOCK)
0386             newbno1 = newbno2;
0387     } else if (freeend >= wantend) {
0388         newbno1 = wantbno;
0389     } else if (alignment > 1) {
0390         newbno1 = roundup(freeend - wantlen, alignment);
0391         if (newbno1 > freeend - wantlen &&
0392             newbno1 - alignment >= freebno)
0393             newbno1 -= alignment;
0394         else if (newbno1 >= freeend)
0395             newbno1 = NULLAGBLOCK;
0396     } else
0397         newbno1 = freeend - wantlen;
0398     *newbnop = newbno1;
0399     return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
0400 }
0401 
0402 /*
0403  * Fix up the length, based on mod and prod.
0404  * len should be k * prod + mod for some k.
0405  * If len is too small it is returned unchanged.
0406  * If len hits maxlen it is left alone.
0407  */
0408 STATIC void
0409 xfs_alloc_fix_len(
0410     xfs_alloc_arg_t *args)      /* allocation argument structure */
0411 {
0412     xfs_extlen_t    k;
0413     xfs_extlen_t    rlen;
0414 
0415     ASSERT(args->mod < args->prod);
0416     rlen = args->len;
0417     ASSERT(rlen >= args->minlen);
0418     ASSERT(rlen <= args->maxlen);
0419     if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
0420         (args->mod == 0 && rlen < args->prod))
0421         return;
0422     k = rlen % args->prod;
0423     if (k == args->mod)
0424         return;
0425     if (k > args->mod)
0426         rlen = rlen - (k - args->mod);
0427     else
0428         rlen = rlen - args->prod + (args->mod - k);
0429     /* casts to (int) catch length underflows */
0430     if ((int)rlen < (int)args->minlen)
0431         return;
0432     ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
0433     ASSERT(rlen % args->prod == args->mod);
0434     ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
0435         rlen + args->minleft);
0436     args->len = rlen;
0437 }
0438 
0439 /*
0440  * Update the two btrees, logically removing from freespace the extent
0441  * starting at rbno, rlen blocks.  The extent is contained within the
0442  * actual (current) free extent fbno for flen blocks.
0443  * Flags are passed in indicating whether the cursors are set to the
0444  * relevant records.
0445  */
0446 STATIC int              /* error code */
0447 xfs_alloc_fixup_trees(
0448     struct xfs_btree_cur *cnt_cur,  /* cursor for by-size btree */
0449     struct xfs_btree_cur *bno_cur,  /* cursor for by-block btree */
0450     xfs_agblock_t   fbno,       /* starting block of free extent */
0451     xfs_extlen_t    flen,       /* length of free extent */
0452     xfs_agblock_t   rbno,       /* starting block of returned extent */
0453     xfs_extlen_t    rlen,       /* length of returned extent */
0454     int     flags)      /* flags, XFSA_FIXUP_... */
0455 {
0456     int     error;      /* error code */
0457     int     i;      /* operation results */
0458     xfs_agblock_t   nfbno1;     /* first new free startblock */
0459     xfs_agblock_t   nfbno2;     /* second new free startblock */
0460     xfs_extlen_t    nflen1=0;   /* first new free length */
0461     xfs_extlen_t    nflen2=0;   /* second new free length */
0462     struct xfs_mount *mp;
0463 
0464     mp = cnt_cur->bc_mp;
0465 
0466     /*
0467      * Look up the record in the by-size tree if necessary.
0468      */
0469     if (flags & XFSA_FIXUP_CNT_OK) {
0470 #ifdef DEBUG
0471         if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
0472             return error;
0473         if (XFS_IS_CORRUPT(mp,
0474                    i != 1 ||
0475                    nfbno1 != fbno ||
0476                    nflen1 != flen))
0477             return -EFSCORRUPTED;
0478 #endif
0479     } else {
0480         if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
0481             return error;
0482         if (XFS_IS_CORRUPT(mp, i != 1))
0483             return -EFSCORRUPTED;
0484     }
0485     /*
0486      * Look up the record in the by-block tree if necessary.
0487      */
0488     if (flags & XFSA_FIXUP_BNO_OK) {
0489 #ifdef DEBUG
0490         if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
0491             return error;
0492         if (XFS_IS_CORRUPT(mp,
0493                    i != 1 ||
0494                    nfbno1 != fbno ||
0495                    nflen1 != flen))
0496             return -EFSCORRUPTED;
0497 #endif
0498     } else {
0499         if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
0500             return error;
0501         if (XFS_IS_CORRUPT(mp, i != 1))
0502             return -EFSCORRUPTED;
0503     }
0504 
0505 #ifdef DEBUG
0506     if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
0507         struct xfs_btree_block  *bnoblock;
0508         struct xfs_btree_block  *cntblock;
0509 
0510         bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
0511         cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
0512 
0513         if (XFS_IS_CORRUPT(mp,
0514                    bnoblock->bb_numrecs !=
0515                    cntblock->bb_numrecs))
0516             return -EFSCORRUPTED;
0517     }
0518 #endif
0519 
0520     /*
0521      * Deal with all four cases: the allocated record is contained
0522      * within the freespace record, so we can have new freespace
0523      * at either (or both) end, or no freespace remaining.
0524      */
0525     if (rbno == fbno && rlen == flen)
0526         nfbno1 = nfbno2 = NULLAGBLOCK;
0527     else if (rbno == fbno) {
0528         nfbno1 = rbno + rlen;
0529         nflen1 = flen - rlen;
0530         nfbno2 = NULLAGBLOCK;
0531     } else if (rbno + rlen == fbno + flen) {
0532         nfbno1 = fbno;
0533         nflen1 = flen - rlen;
0534         nfbno2 = NULLAGBLOCK;
0535     } else {
0536         nfbno1 = fbno;
0537         nflen1 = rbno - fbno;
0538         nfbno2 = rbno + rlen;
0539         nflen2 = (fbno + flen) - nfbno2;
0540     }
0541     /*
0542      * Delete the entry from the by-size btree.
0543      */
0544     if ((error = xfs_btree_delete(cnt_cur, &i)))
0545         return error;
0546     if (XFS_IS_CORRUPT(mp, i != 1))
0547         return -EFSCORRUPTED;
0548     /*
0549      * Add new by-size btree entry(s).
0550      */
0551     if (nfbno1 != NULLAGBLOCK) {
0552         if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
0553             return error;
0554         if (XFS_IS_CORRUPT(mp, i != 0))
0555             return -EFSCORRUPTED;
0556         if ((error = xfs_btree_insert(cnt_cur, &i)))
0557             return error;
0558         if (XFS_IS_CORRUPT(mp, i != 1))
0559             return -EFSCORRUPTED;
0560     }
0561     if (nfbno2 != NULLAGBLOCK) {
0562         if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
0563             return error;
0564         if (XFS_IS_CORRUPT(mp, i != 0))
0565             return -EFSCORRUPTED;
0566         if ((error = xfs_btree_insert(cnt_cur, &i)))
0567             return error;
0568         if (XFS_IS_CORRUPT(mp, i != 1))
0569             return -EFSCORRUPTED;
0570     }
0571     /*
0572      * Fix up the by-block btree entry(s).
0573      */
0574     if (nfbno1 == NULLAGBLOCK) {
0575         /*
0576          * No remaining freespace, just delete the by-block tree entry.
0577          */
0578         if ((error = xfs_btree_delete(bno_cur, &i)))
0579             return error;
0580         if (XFS_IS_CORRUPT(mp, i != 1))
0581             return -EFSCORRUPTED;
0582     } else {
0583         /*
0584          * Update the by-block entry to start later|be shorter.
0585          */
0586         if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
0587             return error;
0588     }
0589     if (nfbno2 != NULLAGBLOCK) {
0590         /*
0591          * 2 resulting free entries, need to add one.
0592          */
0593         if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
0594             return error;
0595         if (XFS_IS_CORRUPT(mp, i != 0))
0596             return -EFSCORRUPTED;
0597         if ((error = xfs_btree_insert(bno_cur, &i)))
0598             return error;
0599         if (XFS_IS_CORRUPT(mp, i != 1))
0600             return -EFSCORRUPTED;
0601     }
0602     return 0;
0603 }
0604 
0605 static xfs_failaddr_t
0606 xfs_agfl_verify(
0607     struct xfs_buf  *bp)
0608 {
0609     struct xfs_mount *mp = bp->b_mount;
0610     struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
0611     __be32      *agfl_bno = xfs_buf_to_agfl_bno(bp);
0612     int     i;
0613 
0614     /*
0615      * There is no verification of non-crc AGFLs because mkfs does not
0616      * initialise the AGFL to zero or NULL. Hence the only valid part of the
0617      * AGFL is what the AGF says is active. We can't get to the AGF, so we
0618      * can't verify just those entries are valid.
0619      */
0620     if (!xfs_has_crc(mp))
0621         return NULL;
0622 
0623     if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
0624         return __this_address;
0625     if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
0626         return __this_address;
0627     /*
0628      * during growfs operations, the perag is not fully initialised,
0629      * so we can't use it for any useful checking. growfs ensures we can't
0630      * use it by using uncached buffers that don't have the perag attached
0631      * so we can detect and avoid this problem.
0632      */
0633     if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
0634         return __this_address;
0635 
0636     for (i = 0; i < xfs_agfl_size(mp); i++) {
0637         if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
0638             be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
0639             return __this_address;
0640     }
0641 
0642     if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
0643         return __this_address;
0644     return NULL;
0645 }
0646 
0647 static void
0648 xfs_agfl_read_verify(
0649     struct xfs_buf  *bp)
0650 {
0651     struct xfs_mount *mp = bp->b_mount;
0652     xfs_failaddr_t  fa;
0653 
0654     /*
0655      * There is no verification of non-crc AGFLs because mkfs does not
0656      * initialise the AGFL to zero or NULL. Hence the only valid part of the
0657      * AGFL is what the AGF says is active. We can't get to the AGF, so we
0658      * can't verify just those entries are valid.
0659      */
0660     if (!xfs_has_crc(mp))
0661         return;
0662 
0663     if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
0664         xfs_verifier_error(bp, -EFSBADCRC, __this_address);
0665     else {
0666         fa = xfs_agfl_verify(bp);
0667         if (fa)
0668             xfs_verifier_error(bp, -EFSCORRUPTED, fa);
0669     }
0670 }
0671 
0672 static void
0673 xfs_agfl_write_verify(
0674     struct xfs_buf  *bp)
0675 {
0676     struct xfs_mount    *mp = bp->b_mount;
0677     struct xfs_buf_log_item *bip = bp->b_log_item;
0678     xfs_failaddr_t      fa;
0679 
0680     /* no verification of non-crc AGFLs */
0681     if (!xfs_has_crc(mp))
0682         return;
0683 
0684     fa = xfs_agfl_verify(bp);
0685     if (fa) {
0686         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
0687         return;
0688     }
0689 
0690     if (bip)
0691         XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
0692 
0693     xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
0694 }
0695 
0696 const struct xfs_buf_ops xfs_agfl_buf_ops = {
0697     .name = "xfs_agfl",
0698     .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
0699     .verify_read = xfs_agfl_read_verify,
0700     .verify_write = xfs_agfl_write_verify,
0701     .verify_struct = xfs_agfl_verify,
0702 };
0703 
0704 /*
0705  * Read in the allocation group free block array.
0706  */
0707 int
0708 xfs_alloc_read_agfl(
0709     struct xfs_perag    *pag,
0710     struct xfs_trans    *tp,
0711     struct xfs_buf      **bpp)
0712 {
0713     struct xfs_mount    *mp = pag->pag_mount;
0714     struct xfs_buf      *bp;
0715     int         error;
0716 
0717     error = xfs_trans_read_buf(
0718             mp, tp, mp->m_ddev_targp,
0719             XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
0720             XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
0721     if (error)
0722         return error;
0723     xfs_buf_set_ref(bp, XFS_AGFL_REF);
0724     *bpp = bp;
0725     return 0;
0726 }
0727 
0728 STATIC int
0729 xfs_alloc_update_counters(
0730     struct xfs_trans    *tp,
0731     struct xfs_buf      *agbp,
0732     long            len)
0733 {
0734     struct xfs_agf      *agf = agbp->b_addr;
0735 
0736     agbp->b_pag->pagf_freeblks += len;
0737     be32_add_cpu(&agf->agf_freeblks, len);
0738 
0739     if (unlikely(be32_to_cpu(agf->agf_freeblks) >
0740              be32_to_cpu(agf->agf_length))) {
0741         xfs_buf_mark_corrupt(agbp);
0742         return -EFSCORRUPTED;
0743     }
0744 
0745     xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
0746     return 0;
0747 }
0748 
0749 /*
0750  * Block allocation algorithm and data structures.
0751  */
0752 struct xfs_alloc_cur {
0753     struct xfs_btree_cur        *cnt;   /* btree cursors */
0754     struct xfs_btree_cur        *bnolt;
0755     struct xfs_btree_cur        *bnogt;
0756     xfs_extlen_t            cur_len;/* current search length */
0757     xfs_agblock_t           rec_bno;/* extent startblock */
0758     xfs_extlen_t            rec_len;/* extent length */
0759     xfs_agblock_t           bno;    /* alloc bno */
0760     xfs_extlen_t            len;    /* alloc len */
0761     xfs_extlen_t            diff;   /* diff from search bno */
0762     unsigned int            busy_gen;/* busy state */
0763     bool                busy;
0764 };
0765 
0766 /*
0767  * Set up cursors, etc. in the extent allocation cursor. This function can be
0768  * called multiple times to reset an initialized structure without having to
0769  * reallocate cursors.
0770  */
0771 static int
0772 xfs_alloc_cur_setup(
0773     struct xfs_alloc_arg    *args,
0774     struct xfs_alloc_cur    *acur)
0775 {
0776     int         error;
0777     int         i;
0778 
0779     ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
0780 
0781     acur->cur_len = args->maxlen;
0782     acur->rec_bno = 0;
0783     acur->rec_len = 0;
0784     acur->bno = 0;
0785     acur->len = 0;
0786     acur->diff = -1;
0787     acur->busy = false;
0788     acur->busy_gen = 0;
0789 
0790     /*
0791      * Perform an initial cntbt lookup to check for availability of maxlen
0792      * extents. If this fails, we'll return -ENOSPC to signal the caller to
0793      * attempt a small allocation.
0794      */
0795     if (!acur->cnt)
0796         acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
0797                     args->agbp, args->pag, XFS_BTNUM_CNT);
0798     error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
0799     if (error)
0800         return error;
0801 
0802     /*
0803      * Allocate the bnobt left and right search cursors.
0804      */
0805     if (!acur->bnolt)
0806         acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
0807                     args->agbp, args->pag, XFS_BTNUM_BNO);
0808     if (!acur->bnogt)
0809         acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
0810                     args->agbp, args->pag, XFS_BTNUM_BNO);
0811     return i == 1 ? 0 : -ENOSPC;
0812 }
0813 
0814 static void
0815 xfs_alloc_cur_close(
0816     struct xfs_alloc_cur    *acur,
0817     bool            error)
0818 {
0819     int         cur_error = XFS_BTREE_NOERROR;
0820 
0821     if (error)
0822         cur_error = XFS_BTREE_ERROR;
0823 
0824     if (acur->cnt)
0825         xfs_btree_del_cursor(acur->cnt, cur_error);
0826     if (acur->bnolt)
0827         xfs_btree_del_cursor(acur->bnolt, cur_error);
0828     if (acur->bnogt)
0829         xfs_btree_del_cursor(acur->bnogt, cur_error);
0830     acur->cnt = acur->bnolt = acur->bnogt = NULL;
0831 }
0832 
0833 /*
0834  * Check an extent for allocation and track the best available candidate in the
0835  * allocation structure. The cursor is deactivated if it has entered an out of
0836  * range state based on allocation arguments. Optionally return the extent
0837  * extent geometry and allocation status if requested by the caller.
0838  */
0839 static int
0840 xfs_alloc_cur_check(
0841     struct xfs_alloc_arg    *args,
0842     struct xfs_alloc_cur    *acur,
0843     struct xfs_btree_cur    *cur,
0844     int         *new)
0845 {
0846     int         error, i;
0847     xfs_agblock_t       bno, bnoa, bnew;
0848     xfs_extlen_t        len, lena, diff = -1;
0849     bool            busy;
0850     unsigned        busy_gen = 0;
0851     bool            deactivate = false;
0852     bool            isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
0853 
0854     *new = 0;
0855 
0856     error = xfs_alloc_get_rec(cur, &bno, &len, &i);
0857     if (error)
0858         return error;
0859     if (XFS_IS_CORRUPT(args->mp, i != 1))
0860         return -EFSCORRUPTED;
0861 
0862     /*
0863      * Check minlen and deactivate a cntbt cursor if out of acceptable size
0864      * range (i.e., walking backwards looking for a minlen extent).
0865      */
0866     if (len < args->minlen) {
0867         deactivate = !isbnobt;
0868         goto out;
0869     }
0870 
0871     busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
0872                      &busy_gen);
0873     acur->busy |= busy;
0874     if (busy)
0875         acur->busy_gen = busy_gen;
0876     /* deactivate a bnobt cursor outside of locality range */
0877     if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
0878         deactivate = isbnobt;
0879         goto out;
0880     }
0881     if (lena < args->minlen)
0882         goto out;
0883 
0884     args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
0885     xfs_alloc_fix_len(args);
0886     ASSERT(args->len >= args->minlen);
0887     if (args->len < acur->len)
0888         goto out;
0889 
0890     /*
0891      * We have an aligned record that satisfies minlen and beats or matches
0892      * the candidate extent size. Compare locality for near allocation mode.
0893      */
0894     ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
0895     diff = xfs_alloc_compute_diff(args->agbno, args->len,
0896                       args->alignment, args->datatype,
0897                       bnoa, lena, &bnew);
0898     if (bnew == NULLAGBLOCK)
0899         goto out;
0900 
0901     /*
0902      * Deactivate a bnobt cursor with worse locality than the current best.
0903      */
0904     if (diff > acur->diff) {
0905         deactivate = isbnobt;
0906         goto out;
0907     }
0908 
0909     ASSERT(args->len > acur->len ||
0910            (args->len == acur->len && diff <= acur->diff));
0911     acur->rec_bno = bno;
0912     acur->rec_len = len;
0913     acur->bno = bnew;
0914     acur->len = args->len;
0915     acur->diff = diff;
0916     *new = 1;
0917 
0918     /*
0919      * We're done if we found a perfect allocation. This only deactivates
0920      * the current cursor, but this is just an optimization to terminate a
0921      * cntbt search that otherwise runs to the edge of the tree.
0922      */
0923     if (acur->diff == 0 && acur->len == args->maxlen)
0924         deactivate = true;
0925 out:
0926     if (deactivate)
0927         cur->bc_ag.abt.active = false;
0928     trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
0929                   *new);
0930     return 0;
0931 }
0932 
0933 /*
0934  * Complete an allocation of a candidate extent. Remove the extent from both
0935  * trees and update the args structure.
0936  */
0937 STATIC int
0938 xfs_alloc_cur_finish(
0939     struct xfs_alloc_arg    *args,
0940     struct xfs_alloc_cur    *acur)
0941 {
0942     struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
0943     int         error;
0944 
0945     ASSERT(acur->cnt && acur->bnolt);
0946     ASSERT(acur->bno >= acur->rec_bno);
0947     ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
0948     ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
0949 
0950     error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
0951                       acur->rec_len, acur->bno, acur->len, 0);
0952     if (error)
0953         return error;
0954 
0955     args->agbno = acur->bno;
0956     args->len = acur->len;
0957     args->wasfromfl = 0;
0958 
0959     trace_xfs_alloc_cur(args);
0960     return 0;
0961 }
0962 
0963 /*
0964  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
0965  * bno optimized lookup to search for extents with ideal size and locality.
0966  */
0967 STATIC int
0968 xfs_alloc_cntbt_iter(
0969     struct xfs_alloc_arg        *args,
0970     struct xfs_alloc_cur        *acur)
0971 {
0972     struct xfs_btree_cur    *cur = acur->cnt;
0973     xfs_agblock_t       bno;
0974     xfs_extlen_t        len, cur_len;
0975     int         error;
0976     int         i;
0977 
0978     if (!xfs_alloc_cur_active(cur))
0979         return 0;
0980 
0981     /* locality optimized lookup */
0982     cur_len = acur->cur_len;
0983     error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
0984     if (error)
0985         return error;
0986     if (i == 0)
0987         return 0;
0988     error = xfs_alloc_get_rec(cur, &bno, &len, &i);
0989     if (error)
0990         return error;
0991 
0992     /* check the current record and update search length from it */
0993     error = xfs_alloc_cur_check(args, acur, cur, &i);
0994     if (error)
0995         return error;
0996     ASSERT(len >= acur->cur_len);
0997     acur->cur_len = len;
0998 
0999     /*
1000      * We looked up the first record >= [agbno, len] above. The agbno is a
1001      * secondary key and so the current record may lie just before or after
1002      * agbno. If it is past agbno, check the previous record too so long as
1003      * the length matches as it may be closer. Don't check a smaller record
1004      * because that could deactivate our cursor.
1005      */
1006     if (bno > args->agbno) {
1007         error = xfs_btree_decrement(cur, 0, &i);
1008         if (!error && i) {
1009             error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1010             if (!error && i && len == acur->cur_len)
1011                 error = xfs_alloc_cur_check(args, acur, cur,
1012                                 &i);
1013         }
1014         if (error)
1015             return error;
1016     }
1017 
1018     /*
1019      * Increment the search key until we find at least one allocation
1020      * candidate or if the extent we found was larger. Otherwise, double the
1021      * search key to optimize the search. Efficiency is more important here
1022      * than absolute best locality.
1023      */
1024     cur_len <<= 1;
1025     if (!acur->len || acur->cur_len >= cur_len)
1026         acur->cur_len++;
1027     else
1028         acur->cur_len = cur_len;
1029 
1030     return error;
1031 }
1032 
1033 /*
1034  * Deal with the case where only small freespaces remain. Either return the
1035  * contents of the last freespace record, or allocate space from the freelist if
1036  * there is nothing in the tree.
1037  */
1038 STATIC int          /* error */
1039 xfs_alloc_ag_vextent_small(
1040     struct xfs_alloc_arg    *args,  /* allocation argument structure */
1041     struct xfs_btree_cur    *ccur,  /* optional by-size cursor */
1042     xfs_agblock_t       *fbnop, /* result block number */
1043     xfs_extlen_t        *flenp, /* result length */
1044     int         *stat)  /* status: 0-freelist, 1-normal/none */
1045 {
1046     struct xfs_agf      *agf = args->agbp->b_addr;
1047     int         error = 0;
1048     xfs_agblock_t       fbno = NULLAGBLOCK;
1049     xfs_extlen_t        flen = 0;
1050     int         i = 0;
1051 
1052     /*
1053      * If a cntbt cursor is provided, try to allocate the largest record in
1054      * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1055      * allocation. Make sure to respect minleft even when pulling from the
1056      * freelist.
1057      */
1058     if (ccur)
1059         error = xfs_btree_decrement(ccur, 0, &i);
1060     if (error)
1061         goto error;
1062     if (i) {
1063         error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1064         if (error)
1065             goto error;
1066         if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1067             error = -EFSCORRUPTED;
1068             goto error;
1069         }
1070         goto out;
1071     }
1072 
1073     if (args->minlen != 1 || args->alignment != 1 ||
1074         args->resv == XFS_AG_RESV_AGFL ||
1075         be32_to_cpu(agf->agf_flcount) <= args->minleft)
1076         goto out;
1077 
1078     error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1079             &fbno, 0);
1080     if (error)
1081         goto error;
1082     if (fbno == NULLAGBLOCK)
1083         goto out;
1084 
1085     xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1086                   (args->datatype & XFS_ALLOC_NOBUSY));
1087 
1088     if (args->datatype & XFS_ALLOC_USERDATA) {
1089         struct xfs_buf  *bp;
1090 
1091         error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1092                 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1093                 args->mp->m_bsize, 0, &bp);
1094         if (error)
1095             goto error;
1096         xfs_trans_binval(args->tp, bp);
1097     }
1098     *fbnop = args->agbno = fbno;
1099     *flenp = args->len = 1;
1100     if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1101         error = -EFSCORRUPTED;
1102         goto error;
1103     }
1104     args->wasfromfl = 1;
1105     trace_xfs_alloc_small_freelist(args);
1106 
1107     /*
1108      * If we're feeding an AGFL block to something that doesn't live in the
1109      * free space, we need to clear out the OWN_AG rmap.
1110      */
1111     error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1112                   &XFS_RMAP_OINFO_AG);
1113     if (error)
1114         goto error;
1115 
1116     *stat = 0;
1117     return 0;
1118 
1119 out:
1120     /*
1121      * Can't do the allocation, give up.
1122      */
1123     if (flen < args->minlen) {
1124         args->agbno = NULLAGBLOCK;
1125         trace_xfs_alloc_small_notenough(args);
1126         flen = 0;
1127     }
1128     *fbnop = fbno;
1129     *flenp = flen;
1130     *stat = 1;
1131     trace_xfs_alloc_small_done(args);
1132     return 0;
1133 
1134 error:
1135     trace_xfs_alloc_small_error(args);
1136     return error;
1137 }
1138 
1139 /*
1140  * Allocate a variable extent in the allocation group agno.
1141  * Type and bno are used to determine where in the allocation group the
1142  * extent will start.
1143  * Extent's length (returned in *len) will be between minlen and maxlen,
1144  * and of the form k * prod + mod unless there's nothing that large.
1145  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1146  */
1147 STATIC int          /* error */
1148 xfs_alloc_ag_vextent(
1149     xfs_alloc_arg_t *args)  /* argument structure for allocation */
1150 {
1151     int     error=0;
1152 
1153     ASSERT(args->minlen > 0);
1154     ASSERT(args->maxlen > 0);
1155     ASSERT(args->minlen <= args->maxlen);
1156     ASSERT(args->mod < args->prod);
1157     ASSERT(args->alignment > 0);
1158 
1159     /*
1160      * Branch to correct routine based on the type.
1161      */
1162     args->wasfromfl = 0;
1163     switch (args->type) {
1164     case XFS_ALLOCTYPE_THIS_AG:
1165         error = xfs_alloc_ag_vextent_size(args);
1166         break;
1167     case XFS_ALLOCTYPE_NEAR_BNO:
1168         error = xfs_alloc_ag_vextent_near(args);
1169         break;
1170     case XFS_ALLOCTYPE_THIS_BNO:
1171         error = xfs_alloc_ag_vextent_exact(args);
1172         break;
1173     default:
1174         ASSERT(0);
1175         /* NOTREACHED */
1176     }
1177 
1178     if (error || args->agbno == NULLAGBLOCK)
1179         return error;
1180 
1181     ASSERT(args->len >= args->minlen);
1182     ASSERT(args->len <= args->maxlen);
1183     ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1184     ASSERT(args->agbno % args->alignment == 0);
1185 
1186     /* if not file data, insert new block into the reverse map btree */
1187     if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1188         error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
1189                        args->agbno, args->len, &args->oinfo);
1190         if (error)
1191             return error;
1192     }
1193 
1194     if (!args->wasfromfl) {
1195         error = xfs_alloc_update_counters(args->tp, args->agbp,
1196                           -((long)(args->len)));
1197         if (error)
1198             return error;
1199 
1200         ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
1201                           args->agbno, args->len));
1202     }
1203 
1204     xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1205 
1206     XFS_STATS_INC(args->mp, xs_allocx);
1207     XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1208     return error;
1209 }
1210 
1211 /*
1212  * Allocate a variable extent at exactly agno/bno.
1213  * Extent's length (returned in *len) will be between minlen and maxlen,
1214  * and of the form k * prod + mod unless there's nothing that large.
1215  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1216  */
1217 STATIC int          /* error */
1218 xfs_alloc_ag_vextent_exact(
1219     xfs_alloc_arg_t *args)  /* allocation argument structure */
1220 {
1221     struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1222     struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1223     struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1224     int     error;
1225     xfs_agblock_t   fbno;   /* start block of found extent */
1226     xfs_extlen_t    flen;   /* length of found extent */
1227     xfs_agblock_t   tbno;   /* start block of busy extent */
1228     xfs_extlen_t    tlen;   /* length of busy extent */
1229     xfs_agblock_t   tend;   /* end block of busy extent */
1230     int     i;  /* success/failure of operation */
1231     unsigned    busy_gen;
1232 
1233     ASSERT(args->alignment == 1);
1234 
1235     /*
1236      * Allocate/initialize a cursor for the by-number freespace btree.
1237      */
1238     bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1239                       args->pag, XFS_BTNUM_BNO);
1240 
1241     /*
1242      * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1243      * Look for the closest free block <= bno, it must contain bno
1244      * if any free block does.
1245      */
1246     error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1247     if (error)
1248         goto error0;
1249     if (!i)
1250         goto not_found;
1251 
1252     /*
1253      * Grab the freespace record.
1254      */
1255     error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1256     if (error)
1257         goto error0;
1258     if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1259         error = -EFSCORRUPTED;
1260         goto error0;
1261     }
1262     ASSERT(fbno <= args->agbno);
1263 
1264     /*
1265      * Check for overlapping busy extents.
1266      */
1267     tbno = fbno;
1268     tlen = flen;
1269     xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1270 
1271     /*
1272      * Give up if the start of the extent is busy, or the freespace isn't
1273      * long enough for the minimum request.
1274      */
1275     if (tbno > args->agbno)
1276         goto not_found;
1277     if (tlen < args->minlen)
1278         goto not_found;
1279     tend = tbno + tlen;
1280     if (tend < args->agbno + args->minlen)
1281         goto not_found;
1282 
1283     /*
1284      * End of extent will be smaller of the freespace end and the
1285      * maximal requested end.
1286      *
1287      * Fix the length according to mod and prod if given.
1288      */
1289     args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1290                         - args->agbno;
1291     xfs_alloc_fix_len(args);
1292     ASSERT(args->agbno + args->len <= tend);
1293 
1294     /*
1295      * We are allocating agbno for args->len
1296      * Allocate/initialize a cursor for the by-size btree.
1297      */
1298     cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1299                     args->pag, XFS_BTNUM_CNT);
1300     ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1301     error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1302                       args->len, XFSA_FIXUP_BNO_OK);
1303     if (error) {
1304         xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1305         goto error0;
1306     }
1307 
1308     xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1309     xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1310 
1311     args->wasfromfl = 0;
1312     trace_xfs_alloc_exact_done(args);
1313     return 0;
1314 
1315 not_found:
1316     /* Didn't find it, return null. */
1317     xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1318     args->agbno = NULLAGBLOCK;
1319     trace_xfs_alloc_exact_notfound(args);
1320     return 0;
1321 
1322 error0:
1323     xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1324     trace_xfs_alloc_exact_error(args);
1325     return error;
1326 }
1327 
1328 /*
1329  * Search a given number of btree records in a given direction. Check each
1330  * record against the good extent we've already found.
1331  */
1332 STATIC int
1333 xfs_alloc_walk_iter(
1334     struct xfs_alloc_arg    *args,
1335     struct xfs_alloc_cur    *acur,
1336     struct xfs_btree_cur    *cur,
1337     bool            increment,
1338     bool            find_one, /* quit on first candidate */
1339     int         count,    /* rec count (-1 for infinite) */
1340     int         *stat)
1341 {
1342     int         error;
1343     int         i;
1344 
1345     *stat = 0;
1346 
1347     /*
1348      * Search so long as the cursor is active or we find a better extent.
1349      * The cursor is deactivated if it extends beyond the range of the
1350      * current allocation candidate.
1351      */
1352     while (xfs_alloc_cur_active(cur) && count) {
1353         error = xfs_alloc_cur_check(args, acur, cur, &i);
1354         if (error)
1355             return error;
1356         if (i == 1) {
1357             *stat = 1;
1358             if (find_one)
1359                 break;
1360         }
1361         if (!xfs_alloc_cur_active(cur))
1362             break;
1363 
1364         if (increment)
1365             error = xfs_btree_increment(cur, 0, &i);
1366         else
1367             error = xfs_btree_decrement(cur, 0, &i);
1368         if (error)
1369             return error;
1370         if (i == 0)
1371             cur->bc_ag.abt.active = false;
1372 
1373         if (count > 0)
1374             count--;
1375     }
1376 
1377     return 0;
1378 }
1379 
1380 /*
1381  * Search the by-bno and by-size btrees in parallel in search of an extent with
1382  * ideal locality based on the NEAR mode ->agbno locality hint.
1383  */
1384 STATIC int
1385 xfs_alloc_ag_vextent_locality(
1386     struct xfs_alloc_arg    *args,
1387     struct xfs_alloc_cur    *acur,
1388     int         *stat)
1389 {
1390     struct xfs_btree_cur    *fbcur = NULL;
1391     int         error;
1392     int         i;
1393     bool            fbinc;
1394 
1395     ASSERT(acur->len == 0);
1396     ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1397 
1398     *stat = 0;
1399 
1400     error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1401     if (error)
1402         return error;
1403     error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1404     if (error)
1405         return error;
1406     error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1407     if (error)
1408         return error;
1409 
1410     /*
1411      * Search the bnobt and cntbt in parallel. Search the bnobt left and
1412      * right and lookup the closest extent to the locality hint for each
1413      * extent size key in the cntbt. The entire search terminates
1414      * immediately on a bnobt hit because that means we've found best case
1415      * locality. Otherwise the search continues until the cntbt cursor runs
1416      * off the end of the tree. If no allocation candidate is found at this
1417      * point, give up on locality, walk backwards from the end of the cntbt
1418      * and take the first available extent.
1419      *
1420      * The parallel tree searches balance each other out to provide fairly
1421      * consistent performance for various situations. The bnobt search can
1422      * have pathological behavior in the worst case scenario of larger
1423      * allocation requests and fragmented free space. On the other hand, the
1424      * bnobt is able to satisfy most smaller allocation requests much more
1425      * quickly than the cntbt. The cntbt search can sift through fragmented
1426      * free space and sets of free extents for larger allocation requests
1427      * more quickly than the bnobt. Since the locality hint is just a hint
1428      * and we don't want to scan the entire bnobt for perfect locality, the
1429      * cntbt search essentially bounds the bnobt search such that we can
1430      * find good enough locality at reasonable performance in most cases.
1431      */
1432     while (xfs_alloc_cur_active(acur->bnolt) ||
1433            xfs_alloc_cur_active(acur->bnogt) ||
1434            xfs_alloc_cur_active(acur->cnt)) {
1435 
1436         trace_xfs_alloc_cur_lookup(args);
1437 
1438         /*
1439          * Search the bnobt left and right. In the case of a hit, finish
1440          * the search in the opposite direction and we're done.
1441          */
1442         error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1443                         true, 1, &i);
1444         if (error)
1445             return error;
1446         if (i == 1) {
1447             trace_xfs_alloc_cur_left(args);
1448             fbcur = acur->bnogt;
1449             fbinc = true;
1450             break;
1451         }
1452         error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1453                         1, &i);
1454         if (error)
1455             return error;
1456         if (i == 1) {
1457             trace_xfs_alloc_cur_right(args);
1458             fbcur = acur->bnolt;
1459             fbinc = false;
1460             break;
1461         }
1462 
1463         /*
1464          * Check the extent with best locality based on the current
1465          * extent size search key and keep track of the best candidate.
1466          */
1467         error = xfs_alloc_cntbt_iter(args, acur);
1468         if (error)
1469             return error;
1470         if (!xfs_alloc_cur_active(acur->cnt)) {
1471             trace_xfs_alloc_cur_lookup_done(args);
1472             break;
1473         }
1474     }
1475 
1476     /*
1477      * If we failed to find anything due to busy extents, return empty
1478      * handed so the caller can flush and retry. If no busy extents were
1479      * found, walk backwards from the end of the cntbt as a last resort.
1480      */
1481     if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1482         error = xfs_btree_decrement(acur->cnt, 0, &i);
1483         if (error)
1484             return error;
1485         if (i) {
1486             acur->cnt->bc_ag.abt.active = true;
1487             fbcur = acur->cnt;
1488             fbinc = false;
1489         }
1490     }
1491 
1492     /*
1493      * Search in the opposite direction for a better entry in the case of
1494      * a bnobt hit or walk backwards from the end of the cntbt.
1495      */
1496     if (fbcur) {
1497         error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1498                         &i);
1499         if (error)
1500             return error;
1501     }
1502 
1503     if (acur->len)
1504         *stat = 1;
1505 
1506     return 0;
1507 }
1508 
1509 /* Check the last block of the cnt btree for allocations. */
1510 static int
1511 xfs_alloc_ag_vextent_lastblock(
1512     struct xfs_alloc_arg    *args,
1513     struct xfs_alloc_cur    *acur,
1514     xfs_agblock_t       *bno,
1515     xfs_extlen_t        *len,
1516     bool            *allocated)
1517 {
1518     int         error;
1519     int         i;
1520 
1521 #ifdef DEBUG
1522     /* Randomly don't execute the first algorithm. */
1523     if (prandom_u32() & 1)
1524         return 0;
1525 #endif
1526 
1527     /*
1528      * Start from the entry that lookup found, sequence through all larger
1529      * free blocks.  If we're actually pointing at a record smaller than
1530      * maxlen, go to the start of this block, and skip all those smaller
1531      * than minlen.
1532      */
1533     if (*len || args->alignment > 1) {
1534         acur->cnt->bc_levels[0].ptr = 1;
1535         do {
1536             error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1537             if (error)
1538                 return error;
1539             if (XFS_IS_CORRUPT(args->mp, i != 1))
1540                 return -EFSCORRUPTED;
1541             if (*len >= args->minlen)
1542                 break;
1543             error = xfs_btree_increment(acur->cnt, 0, &i);
1544             if (error)
1545                 return error;
1546         } while (i);
1547         ASSERT(*len >= args->minlen);
1548         if (!i)
1549             return 0;
1550     }
1551 
1552     error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1553     if (error)
1554         return error;
1555 
1556     /*
1557      * It didn't work.  We COULD be in a case where there's a good record
1558      * somewhere, so try again.
1559      */
1560     if (acur->len == 0)
1561         return 0;
1562 
1563     trace_xfs_alloc_near_first(args);
1564     *allocated = true;
1565     return 0;
1566 }
1567 
1568 /*
1569  * Allocate a variable extent near bno in the allocation group agno.
1570  * Extent's length (returned in len) will be between minlen and maxlen,
1571  * and of the form k * prod + mod unless there's nothing that large.
1572  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1573  */
1574 STATIC int
1575 xfs_alloc_ag_vextent_near(
1576     struct xfs_alloc_arg    *args)
1577 {
1578     struct xfs_alloc_cur    acur = {};
1579     int         error;      /* error code */
1580     int         i;      /* result code, temporary */
1581     xfs_agblock_t       bno;
1582     xfs_extlen_t        len;
1583 
1584     /* handle uninitialized agbno range so caller doesn't have to */
1585     if (!args->min_agbno && !args->max_agbno)
1586         args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1587     ASSERT(args->min_agbno <= args->max_agbno);
1588 
1589     /* clamp agbno to the range if it's outside */
1590     if (args->agbno < args->min_agbno)
1591         args->agbno = args->min_agbno;
1592     if (args->agbno > args->max_agbno)
1593         args->agbno = args->max_agbno;
1594 
1595 restart:
1596     len = 0;
1597 
1598     /*
1599      * Set up cursors and see if there are any free extents as big as
1600      * maxlen. If not, pick the last entry in the tree unless the tree is
1601      * empty.
1602      */
1603     error = xfs_alloc_cur_setup(args, &acur);
1604     if (error == -ENOSPC) {
1605         error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1606                 &len, &i);
1607         if (error)
1608             goto out;
1609         if (i == 0 || len == 0) {
1610             trace_xfs_alloc_near_noentry(args);
1611             goto out;
1612         }
1613         ASSERT(i == 1);
1614     } else if (error) {
1615         goto out;
1616     }
1617 
1618     /*
1619      * First algorithm.
1620      * If the requested extent is large wrt the freespaces available
1621      * in this a.g., then the cursor will be pointing to a btree entry
1622      * near the right edge of the tree.  If it's in the last btree leaf
1623      * block, then we just examine all the entries in that block
1624      * that are big enough, and pick the best one.
1625      */
1626     if (xfs_btree_islastblock(acur.cnt, 0)) {
1627         bool        allocated = false;
1628 
1629         error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1630                 &allocated);
1631         if (error)
1632             goto out;
1633         if (allocated)
1634             goto alloc_finish;
1635     }
1636 
1637     /*
1638      * Second algorithm. Combined cntbt and bnobt search to find ideal
1639      * locality.
1640      */
1641     error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1642     if (error)
1643         goto out;
1644 
1645     /*
1646      * If we couldn't get anything, give up.
1647      */
1648     if (!acur.len) {
1649         if (acur.busy) {
1650             trace_xfs_alloc_near_busy(args);
1651             xfs_extent_busy_flush(args->mp, args->pag,
1652                           acur.busy_gen);
1653             goto restart;
1654         }
1655         trace_xfs_alloc_size_neither(args);
1656         args->agbno = NULLAGBLOCK;
1657         goto out;
1658     }
1659 
1660 alloc_finish:
1661     /* fix up btrees on a successful allocation */
1662     error = xfs_alloc_cur_finish(args, &acur);
1663 
1664 out:
1665     xfs_alloc_cur_close(&acur, error);
1666     return error;
1667 }
1668 
1669 /*
1670  * Allocate a variable extent anywhere in the allocation group agno.
1671  * Extent's length (returned in len) will be between minlen and maxlen,
1672  * and of the form k * prod + mod unless there's nothing that large.
1673  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1674  */
1675 STATIC int              /* error */
1676 xfs_alloc_ag_vextent_size(
1677     xfs_alloc_arg_t *args)      /* allocation argument structure */
1678 {
1679     struct xfs_agf  *agf = args->agbp->b_addr;
1680     struct xfs_btree_cur *bno_cur;  /* cursor for bno btree */
1681     struct xfs_btree_cur *cnt_cur;  /* cursor for cnt btree */
1682     int     error;      /* error result */
1683     xfs_agblock_t   fbno;       /* start of found freespace */
1684     xfs_extlen_t    flen;       /* length of found freespace */
1685     int     i;      /* temp status variable */
1686     xfs_agblock_t   rbno;       /* returned block number */
1687     xfs_extlen_t    rlen;       /* length of returned extent */
1688     bool        busy;
1689     unsigned    busy_gen;
1690 
1691 restart:
1692     /*
1693      * Allocate and initialize a cursor for the by-size btree.
1694      */
1695     cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1696                     args->pag, XFS_BTNUM_CNT);
1697     bno_cur = NULL;
1698 
1699     /*
1700      * Look for an entry >= maxlen+alignment-1 blocks.
1701      */
1702     if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1703             args->maxlen + args->alignment - 1, &i)))
1704         goto error0;
1705 
1706     /*
1707      * If none then we have to settle for a smaller extent. In the case that
1708      * there are no large extents, this will return the last entry in the
1709      * tree unless the tree is empty. In the case that there are only busy
1710      * large extents, this will return the largest small extent unless there
1711      * are no smaller extents available.
1712      */
1713     if (!i) {
1714         error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1715                            &fbno, &flen, &i);
1716         if (error)
1717             goto error0;
1718         if (i == 0 || flen == 0) {
1719             xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1720             trace_xfs_alloc_size_noentry(args);
1721             return 0;
1722         }
1723         ASSERT(i == 1);
1724         busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1725                 &rlen, &busy_gen);
1726     } else {
1727         /*
1728          * Search for a non-busy extent that is large enough.
1729          */
1730         for (;;) {
1731             error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1732             if (error)
1733                 goto error0;
1734             if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1735                 error = -EFSCORRUPTED;
1736                 goto error0;
1737             }
1738 
1739             busy = xfs_alloc_compute_aligned(args, fbno, flen,
1740                     &rbno, &rlen, &busy_gen);
1741 
1742             if (rlen >= args->maxlen)
1743                 break;
1744 
1745             error = xfs_btree_increment(cnt_cur, 0, &i);
1746             if (error)
1747                 goto error0;
1748             if (i == 0) {
1749                 /*
1750                  * Our only valid extents must have been busy.
1751                  * Make it unbusy by forcing the log out and
1752                  * retrying.
1753                  */
1754                 xfs_btree_del_cursor(cnt_cur,
1755                              XFS_BTREE_NOERROR);
1756                 trace_xfs_alloc_size_busy(args);
1757                 xfs_extent_busy_flush(args->mp,
1758                             args->pag, busy_gen);
1759                 goto restart;
1760             }
1761         }
1762     }
1763 
1764     /*
1765      * In the first case above, we got the last entry in the
1766      * by-size btree.  Now we check to see if the space hits maxlen
1767      * once aligned; if not, we search left for something better.
1768      * This can't happen in the second case above.
1769      */
1770     rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1771     if (XFS_IS_CORRUPT(args->mp,
1772                rlen != 0 &&
1773                (rlen > flen ||
1774                 rbno + rlen > fbno + flen))) {
1775         error = -EFSCORRUPTED;
1776         goto error0;
1777     }
1778     if (rlen < args->maxlen) {
1779         xfs_agblock_t   bestfbno;
1780         xfs_extlen_t    bestflen;
1781         xfs_agblock_t   bestrbno;
1782         xfs_extlen_t    bestrlen;
1783 
1784         bestrlen = rlen;
1785         bestrbno = rbno;
1786         bestflen = flen;
1787         bestfbno = fbno;
1788         for (;;) {
1789             if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1790                 goto error0;
1791             if (i == 0)
1792                 break;
1793             if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1794                     &i)))
1795                 goto error0;
1796             if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1797                 error = -EFSCORRUPTED;
1798                 goto error0;
1799             }
1800             if (flen < bestrlen)
1801                 break;
1802             busy = xfs_alloc_compute_aligned(args, fbno, flen,
1803                     &rbno, &rlen, &busy_gen);
1804             rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1805             if (XFS_IS_CORRUPT(args->mp,
1806                        rlen != 0 &&
1807                        (rlen > flen ||
1808                         rbno + rlen > fbno + flen))) {
1809                 error = -EFSCORRUPTED;
1810                 goto error0;
1811             }
1812             if (rlen > bestrlen) {
1813                 bestrlen = rlen;
1814                 bestrbno = rbno;
1815                 bestflen = flen;
1816                 bestfbno = fbno;
1817                 if (rlen == args->maxlen)
1818                     break;
1819             }
1820         }
1821         if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1822                 &i)))
1823             goto error0;
1824         if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1825             error = -EFSCORRUPTED;
1826             goto error0;
1827         }
1828         rlen = bestrlen;
1829         rbno = bestrbno;
1830         flen = bestflen;
1831         fbno = bestfbno;
1832     }
1833     args->wasfromfl = 0;
1834     /*
1835      * Fix up the length.
1836      */
1837     args->len = rlen;
1838     if (rlen < args->minlen) {
1839         if (busy) {
1840             xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1841             trace_xfs_alloc_size_busy(args);
1842             xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1843             goto restart;
1844         }
1845         goto out_nominleft;
1846     }
1847     xfs_alloc_fix_len(args);
1848 
1849     rlen = args->len;
1850     if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1851         error = -EFSCORRUPTED;
1852         goto error0;
1853     }
1854     /*
1855      * Allocate and initialize a cursor for the by-block tree.
1856      */
1857     bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1858                     args->pag, XFS_BTNUM_BNO);
1859     if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1860             rbno, rlen, XFSA_FIXUP_CNT_OK)))
1861         goto error0;
1862     xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1863     xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1864     cnt_cur = bno_cur = NULL;
1865     args->len = rlen;
1866     args->agbno = rbno;
1867     if (XFS_IS_CORRUPT(args->mp,
1868                args->agbno + args->len >
1869                be32_to_cpu(agf->agf_length))) {
1870         error = -EFSCORRUPTED;
1871         goto error0;
1872     }
1873     trace_xfs_alloc_size_done(args);
1874     return 0;
1875 
1876 error0:
1877     trace_xfs_alloc_size_error(args);
1878     if (cnt_cur)
1879         xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1880     if (bno_cur)
1881         xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1882     return error;
1883 
1884 out_nominleft:
1885     xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1886     trace_xfs_alloc_size_nominleft(args);
1887     args->agbno = NULLAGBLOCK;
1888     return 0;
1889 }
1890 
1891 /*
1892  * Free the extent starting at agno/bno for length.
1893  */
1894 STATIC int
1895 xfs_free_ag_extent(
1896     struct xfs_trans        *tp,
1897     struct xfs_buf          *agbp,
1898     xfs_agnumber_t          agno,
1899     xfs_agblock_t           bno,
1900     xfs_extlen_t            len,
1901     const struct xfs_owner_info *oinfo,
1902     enum xfs_ag_resv_type       type)
1903 {
1904     struct xfs_mount        *mp;
1905     struct xfs_btree_cur        *bno_cur;
1906     struct xfs_btree_cur        *cnt_cur;
1907     xfs_agblock_t           gtbno; /* start of right neighbor */
1908     xfs_extlen_t            gtlen; /* length of right neighbor */
1909     xfs_agblock_t           ltbno; /* start of left neighbor */
1910     xfs_extlen_t            ltlen; /* length of left neighbor */
1911     xfs_agblock_t           nbno; /* new starting block of freesp */
1912     xfs_extlen_t            nlen; /* new length of freespace */
1913     int             haveleft; /* have a left neighbor */
1914     int             haveright; /* have a right neighbor */
1915     int             i;
1916     int             error;
1917     struct xfs_perag        *pag = agbp->b_pag;
1918 
1919     bno_cur = cnt_cur = NULL;
1920     mp = tp->t_mountp;
1921 
1922     if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1923         error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1924         if (error)
1925             goto error0;
1926     }
1927 
1928     /*
1929      * Allocate and initialize a cursor for the by-block btree.
1930      */
1931     bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1932     /*
1933      * Look for a neighboring block on the left (lower block numbers)
1934      * that is contiguous with this space.
1935      */
1936     if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1937         goto error0;
1938     if (haveleft) {
1939         /*
1940          * There is a block to our left.
1941          */
1942         if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1943             goto error0;
1944         if (XFS_IS_CORRUPT(mp, i != 1)) {
1945             error = -EFSCORRUPTED;
1946             goto error0;
1947         }
1948         /*
1949          * It's not contiguous, though.
1950          */
1951         if (ltbno + ltlen < bno)
1952             haveleft = 0;
1953         else {
1954             /*
1955              * If this failure happens the request to free this
1956              * space was invalid, it's (partly) already free.
1957              * Very bad.
1958              */
1959             if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1960                 error = -EFSCORRUPTED;
1961                 goto error0;
1962             }
1963         }
1964     }
1965     /*
1966      * Look for a neighboring block on the right (higher block numbers)
1967      * that is contiguous with this space.
1968      */
1969     if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1970         goto error0;
1971     if (haveright) {
1972         /*
1973          * There is a block to our right.
1974          */
1975         if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1976             goto error0;
1977         if (XFS_IS_CORRUPT(mp, i != 1)) {
1978             error = -EFSCORRUPTED;
1979             goto error0;
1980         }
1981         /*
1982          * It's not contiguous, though.
1983          */
1984         if (bno + len < gtbno)
1985             haveright = 0;
1986         else {
1987             /*
1988              * If this failure happens the request to free this
1989              * space was invalid, it's (partly) already free.
1990              * Very bad.
1991              */
1992             if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1993                 error = -EFSCORRUPTED;
1994                 goto error0;
1995             }
1996         }
1997     }
1998     /*
1999      * Now allocate and initialize a cursor for the by-size tree.
2000      */
2001     cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2002     /*
2003      * Have both left and right contiguous neighbors.
2004      * Merge all three into a single free block.
2005      */
2006     if (haveleft && haveright) {
2007         /*
2008          * Delete the old by-size entry on the left.
2009          */
2010         if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2011             goto error0;
2012         if (XFS_IS_CORRUPT(mp, i != 1)) {
2013             error = -EFSCORRUPTED;
2014             goto error0;
2015         }
2016         if ((error = xfs_btree_delete(cnt_cur, &i)))
2017             goto error0;
2018         if (XFS_IS_CORRUPT(mp, i != 1)) {
2019             error = -EFSCORRUPTED;
2020             goto error0;
2021         }
2022         /*
2023          * Delete the old by-size entry on the right.
2024          */
2025         if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2026             goto error0;
2027         if (XFS_IS_CORRUPT(mp, i != 1)) {
2028             error = -EFSCORRUPTED;
2029             goto error0;
2030         }
2031         if ((error = xfs_btree_delete(cnt_cur, &i)))
2032             goto error0;
2033         if (XFS_IS_CORRUPT(mp, i != 1)) {
2034             error = -EFSCORRUPTED;
2035             goto error0;
2036         }
2037         /*
2038          * Delete the old by-block entry for the right block.
2039          */
2040         if ((error = xfs_btree_delete(bno_cur, &i)))
2041             goto error0;
2042         if (XFS_IS_CORRUPT(mp, i != 1)) {
2043             error = -EFSCORRUPTED;
2044             goto error0;
2045         }
2046         /*
2047          * Move the by-block cursor back to the left neighbor.
2048          */
2049         if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2050             goto error0;
2051         if (XFS_IS_CORRUPT(mp, i != 1)) {
2052             error = -EFSCORRUPTED;
2053             goto error0;
2054         }
2055 #ifdef DEBUG
2056         /*
2057          * Check that this is the right record: delete didn't
2058          * mangle the cursor.
2059          */
2060         {
2061             xfs_agblock_t   xxbno;
2062             xfs_extlen_t    xxlen;
2063 
2064             if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2065                     &i)))
2066                 goto error0;
2067             if (XFS_IS_CORRUPT(mp,
2068                        i != 1 ||
2069                        xxbno != ltbno ||
2070                        xxlen != ltlen)) {
2071                 error = -EFSCORRUPTED;
2072                 goto error0;
2073             }
2074         }
2075 #endif
2076         /*
2077          * Update remaining by-block entry to the new, joined block.
2078          */
2079         nbno = ltbno;
2080         nlen = len + ltlen + gtlen;
2081         if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2082             goto error0;
2083     }
2084     /*
2085      * Have only a left contiguous neighbor.
2086      * Merge it together with the new freespace.
2087      */
2088     else if (haveleft) {
2089         /*
2090          * Delete the old by-size entry on the left.
2091          */
2092         if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2093             goto error0;
2094         if (XFS_IS_CORRUPT(mp, i != 1)) {
2095             error = -EFSCORRUPTED;
2096             goto error0;
2097         }
2098         if ((error = xfs_btree_delete(cnt_cur, &i)))
2099             goto error0;
2100         if (XFS_IS_CORRUPT(mp, i != 1)) {
2101             error = -EFSCORRUPTED;
2102             goto error0;
2103         }
2104         /*
2105          * Back up the by-block cursor to the left neighbor, and
2106          * update its length.
2107          */
2108         if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2109             goto error0;
2110         if (XFS_IS_CORRUPT(mp, i != 1)) {
2111             error = -EFSCORRUPTED;
2112             goto error0;
2113         }
2114         nbno = ltbno;
2115         nlen = len + ltlen;
2116         if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2117             goto error0;
2118     }
2119     /*
2120      * Have only a right contiguous neighbor.
2121      * Merge it together with the new freespace.
2122      */
2123     else if (haveright) {
2124         /*
2125          * Delete the old by-size entry on the right.
2126          */
2127         if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2128             goto error0;
2129         if (XFS_IS_CORRUPT(mp, i != 1)) {
2130             error = -EFSCORRUPTED;
2131             goto error0;
2132         }
2133         if ((error = xfs_btree_delete(cnt_cur, &i)))
2134             goto error0;
2135         if (XFS_IS_CORRUPT(mp, i != 1)) {
2136             error = -EFSCORRUPTED;
2137             goto error0;
2138         }
2139         /*
2140          * Update the starting block and length of the right
2141          * neighbor in the by-block tree.
2142          */
2143         nbno = bno;
2144         nlen = len + gtlen;
2145         if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2146             goto error0;
2147     }
2148     /*
2149      * No contiguous neighbors.
2150      * Insert the new freespace into the by-block tree.
2151      */
2152     else {
2153         nbno = bno;
2154         nlen = len;
2155         if ((error = xfs_btree_insert(bno_cur, &i)))
2156             goto error0;
2157         if (XFS_IS_CORRUPT(mp, i != 1)) {
2158             error = -EFSCORRUPTED;
2159             goto error0;
2160         }
2161     }
2162     xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2163     bno_cur = NULL;
2164     /*
2165      * In all cases we need to insert the new freespace in the by-size tree.
2166      */
2167     if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2168         goto error0;
2169     if (XFS_IS_CORRUPT(mp, i != 0)) {
2170         error = -EFSCORRUPTED;
2171         goto error0;
2172     }
2173     if ((error = xfs_btree_insert(cnt_cur, &i)))
2174         goto error0;
2175     if (XFS_IS_CORRUPT(mp, i != 1)) {
2176         error = -EFSCORRUPTED;
2177         goto error0;
2178     }
2179     xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2180     cnt_cur = NULL;
2181 
2182     /*
2183      * Update the freespace totals in the ag and superblock.
2184      */
2185     error = xfs_alloc_update_counters(tp, agbp, len);
2186     xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2187     if (error)
2188         goto error0;
2189 
2190     XFS_STATS_INC(mp, xs_freex);
2191     XFS_STATS_ADD(mp, xs_freeb, len);
2192 
2193     trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2194 
2195     return 0;
2196 
2197  error0:
2198     trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2199     if (bno_cur)
2200         xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2201     if (cnt_cur)
2202         xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2203     return error;
2204 }
2205 
2206 /*
2207  * Visible (exported) allocation/free functions.
2208  * Some of these are used just by xfs_alloc_btree.c and this file.
2209  */
2210 
2211 /*
2212  * Compute and fill in value of m_alloc_maxlevels.
2213  */
2214 void
2215 xfs_alloc_compute_maxlevels(
2216     xfs_mount_t *mp)    /* file system mount structure */
2217 {
2218     mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2219             (mp->m_sb.sb_agblocks + 1) / 2);
2220     ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2221 }
2222 
2223 /*
2224  * Find the length of the longest extent in an AG.  The 'need' parameter
2225  * specifies how much space we're going to need for the AGFL and the
2226  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2227  * other callers.
2228  */
2229 xfs_extlen_t
2230 xfs_alloc_longest_free_extent(
2231     struct xfs_perag    *pag,
2232     xfs_extlen_t        need,
2233     xfs_extlen_t        reserved)
2234 {
2235     xfs_extlen_t        delta = 0;
2236 
2237     /*
2238      * If the AGFL needs a recharge, we'll have to subtract that from the
2239      * longest extent.
2240      */
2241     if (need > pag->pagf_flcount)
2242         delta = need - pag->pagf_flcount;
2243 
2244     /*
2245      * If we cannot maintain others' reservations with space from the
2246      * not-longest freesp extents, we'll have to subtract /that/ from
2247      * the longest extent too.
2248      */
2249     if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2250         delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2251 
2252     /*
2253      * If the longest extent is long enough to satisfy all the
2254      * reservations and AGFL rules in place, we can return this extent.
2255      */
2256     if (pag->pagf_longest > delta)
2257         return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2258                 pag->pagf_longest - delta);
2259 
2260     /* Otherwise, let the caller try for 1 block if there's space. */
2261     return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2262 }
2263 
2264 /*
2265  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2266  * return the largest possible minimum length.
2267  */
2268 unsigned int
2269 xfs_alloc_min_freelist(
2270     struct xfs_mount    *mp,
2271     struct xfs_perag    *pag)
2272 {
2273     /* AG btrees have at least 1 level. */
2274     static const uint8_t    fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2275     const uint8_t       *levels = pag ? pag->pagf_levels : fake_levels;
2276     unsigned int        min_free;
2277 
2278     ASSERT(mp->m_alloc_maxlevels > 0);
2279 
2280     /* space needed by-bno freespace btree */
2281     min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2282                        mp->m_alloc_maxlevels);
2283     /* space needed by-size freespace btree */
2284     min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2285                        mp->m_alloc_maxlevels);
2286     /* space needed reverse mapping used space btree */
2287     if (xfs_has_rmapbt(mp))
2288         min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2289                         mp->m_rmap_maxlevels);
2290 
2291     return min_free;
2292 }
2293 
2294 /*
2295  * Check if the operation we are fixing up the freelist for should go ahead or
2296  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2297  * is dependent on whether the size and shape of free space available will
2298  * permit the requested allocation to take place.
2299  */
2300 static bool
2301 xfs_alloc_space_available(
2302     struct xfs_alloc_arg    *args,
2303     xfs_extlen_t        min_free,
2304     int         flags)
2305 {
2306     struct xfs_perag    *pag = args->pag;
2307     xfs_extlen_t        alloc_len, longest;
2308     xfs_extlen_t        reservation; /* blocks that are still reserved */
2309     int         available;
2310     xfs_extlen_t        agflcount;
2311 
2312     if (flags & XFS_ALLOC_FLAG_FREEING)
2313         return true;
2314 
2315     reservation = xfs_ag_resv_needed(pag, args->resv);
2316 
2317     /* do we have enough contiguous free space for the allocation? */
2318     alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2319     longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2320     if (longest < alloc_len)
2321         return false;
2322 
2323     /*
2324      * Do we have enough free space remaining for the allocation? Don't
2325      * account extra agfl blocks because we are about to defer free them,
2326      * making them unavailable until the current transaction commits.
2327      */
2328     agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2329     available = (int)(pag->pagf_freeblks + agflcount -
2330               reservation - min_free - args->minleft);
2331     if (available < (int)max(args->total, alloc_len))
2332         return false;
2333 
2334     /*
2335      * Clamp maxlen to the amount of free space available for the actual
2336      * extent allocation.
2337      */
2338     if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2339         args->maxlen = available;
2340         ASSERT(args->maxlen > 0);
2341         ASSERT(args->maxlen >= args->minlen);
2342     }
2343 
2344     return true;
2345 }
2346 
2347 int
2348 xfs_free_agfl_block(
2349     struct xfs_trans    *tp,
2350     xfs_agnumber_t      agno,
2351     xfs_agblock_t       agbno,
2352     struct xfs_buf      *agbp,
2353     struct xfs_owner_info   *oinfo)
2354 {
2355     int         error;
2356     struct xfs_buf      *bp;
2357 
2358     error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2359                    XFS_AG_RESV_AGFL);
2360     if (error)
2361         return error;
2362 
2363     error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2364             XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2365             tp->t_mountp->m_bsize, 0, &bp);
2366     if (error)
2367         return error;
2368     xfs_trans_binval(tp, bp);
2369 
2370     return 0;
2371 }
2372 
2373 /*
2374  * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2375  * is to detect an agfl header padding mismatch between current and early v5
2376  * kernels. This problem manifests as a 1-slot size difference between the
2377  * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2378  * may also catch variants of agfl count corruption unrelated to padding. Either
2379  * way, we'll reset the agfl and warn the user.
2380  *
2381  * Return true if a reset is required before the agfl can be used, false
2382  * otherwise.
2383  */
2384 static bool
2385 xfs_agfl_needs_reset(
2386     struct xfs_mount    *mp,
2387     struct xfs_agf      *agf)
2388 {
2389     uint32_t        f = be32_to_cpu(agf->agf_flfirst);
2390     uint32_t        l = be32_to_cpu(agf->agf_fllast);
2391     uint32_t        c = be32_to_cpu(agf->agf_flcount);
2392     int         agfl_size = xfs_agfl_size(mp);
2393     int         active;
2394 
2395     /* no agfl header on v4 supers */
2396     if (!xfs_has_crc(mp))
2397         return false;
2398 
2399     /*
2400      * The agf read verifier catches severe corruption of these fields.
2401      * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2402      * the verifier allows it.
2403      */
2404     if (f >= agfl_size || l >= agfl_size)
2405         return true;
2406     if (c > agfl_size)
2407         return true;
2408 
2409     /*
2410      * Check consistency between the on-disk count and the active range. An
2411      * agfl padding mismatch manifests as an inconsistent flcount.
2412      */
2413     if (c && l >= f)
2414         active = l - f + 1;
2415     else if (c)
2416         active = agfl_size - f + l + 1;
2417     else
2418         active = 0;
2419 
2420     return active != c;
2421 }
2422 
2423 /*
2424  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2425  * agfl content cannot be trusted. Warn the user that a repair is required to
2426  * recover leaked blocks.
2427  *
2428  * The purpose of this mechanism is to handle filesystems affected by the agfl
2429  * header padding mismatch problem. A reset keeps the filesystem online with a
2430  * relatively minor free space accounting inconsistency rather than suffer the
2431  * inevitable crash from use of an invalid agfl block.
2432  */
2433 static void
2434 xfs_agfl_reset(
2435     struct xfs_trans    *tp,
2436     struct xfs_buf      *agbp,
2437     struct xfs_perag    *pag)
2438 {
2439     struct xfs_mount    *mp = tp->t_mountp;
2440     struct xfs_agf      *agf = agbp->b_addr;
2441 
2442     ASSERT(pag->pagf_agflreset);
2443     trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2444 
2445     xfs_warn(mp,
2446            "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2447            "Please unmount and run xfs_repair.",
2448              pag->pag_agno, pag->pagf_flcount);
2449 
2450     agf->agf_flfirst = 0;
2451     agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2452     agf->agf_flcount = 0;
2453     xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2454                     XFS_AGF_FLCOUNT);
2455 
2456     pag->pagf_flcount = 0;
2457     pag->pagf_agflreset = false;
2458 }
2459 
2460 /*
2461  * Defer an AGFL block free. This is effectively equivalent to
2462  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2463  *
2464  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2465  * allocation operations in a transaction. AGFL frees are prone to this problem
2466  * because for one they are always freed one at a time. Further, an immediate
2467  * AGFL block free can cause a btree join and require another block free before
2468  * the real allocation can proceed. Deferring the free disconnects freeing up
2469  * the AGFL slot from freeing the block.
2470  */
2471 STATIC void
2472 xfs_defer_agfl_block(
2473     struct xfs_trans        *tp,
2474     xfs_agnumber_t          agno,
2475     xfs_fsblock_t           agbno,
2476     struct xfs_owner_info       *oinfo)
2477 {
2478     struct xfs_mount        *mp = tp->t_mountp;
2479     struct xfs_extent_free_item *new;       /* new element */
2480 
2481     ASSERT(xfs_extfree_item_cache != NULL);
2482     ASSERT(oinfo != NULL);
2483 
2484     new = kmem_cache_zalloc(xfs_extfree_item_cache,
2485                    GFP_KERNEL | __GFP_NOFAIL);
2486     new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2487     new->xefi_blockcount = 1;
2488     new->xefi_owner = oinfo->oi_owner;
2489 
2490     trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2491 
2492     xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2493 }
2494 
2495 /*
2496  * Add the extent to the list of extents to be free at transaction end.
2497  * The list is maintained sorted (by block number).
2498  */
2499 void
2500 __xfs_free_extent_later(
2501     struct xfs_trans        *tp,
2502     xfs_fsblock_t           bno,
2503     xfs_filblks_t           len,
2504     const struct xfs_owner_info *oinfo,
2505     bool                skip_discard)
2506 {
2507     struct xfs_extent_free_item *new;       /* new element */
2508 #ifdef DEBUG
2509     struct xfs_mount        *mp = tp->t_mountp;
2510     xfs_agnumber_t          agno;
2511     xfs_agblock_t           agbno;
2512 
2513     ASSERT(bno != NULLFSBLOCK);
2514     ASSERT(len > 0);
2515     ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2516     ASSERT(!isnullstartblock(bno));
2517     agno = XFS_FSB_TO_AGNO(mp, bno);
2518     agbno = XFS_FSB_TO_AGBNO(mp, bno);
2519     ASSERT(agno < mp->m_sb.sb_agcount);
2520     ASSERT(agbno < mp->m_sb.sb_agblocks);
2521     ASSERT(len < mp->m_sb.sb_agblocks);
2522     ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2523 #endif
2524     ASSERT(xfs_extfree_item_cache != NULL);
2525 
2526     new = kmem_cache_zalloc(xfs_extfree_item_cache,
2527                    GFP_KERNEL | __GFP_NOFAIL);
2528     new->xefi_startblock = bno;
2529     new->xefi_blockcount = (xfs_extlen_t)len;
2530     if (skip_discard)
2531         new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2532     if (oinfo) {
2533         ASSERT(oinfo->oi_offset == 0);
2534 
2535         if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2536             new->xefi_flags |= XFS_EFI_ATTR_FORK;
2537         if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2538             new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2539         new->xefi_owner = oinfo->oi_owner;
2540     } else {
2541         new->xefi_owner = XFS_RMAP_OWN_NULL;
2542     }
2543     trace_xfs_bmap_free_defer(tp->t_mountp,
2544             XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2545             XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2546     xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
2547 }
2548 
2549 #ifdef DEBUG
2550 /*
2551  * Check if an AGF has a free extent record whose length is equal to
2552  * args->minlen.
2553  */
2554 STATIC int
2555 xfs_exact_minlen_extent_available(
2556     struct xfs_alloc_arg    *args,
2557     struct xfs_buf      *agbp,
2558     int         *stat)
2559 {
2560     struct xfs_btree_cur    *cnt_cur;
2561     xfs_agblock_t       fbno;
2562     xfs_extlen_t        flen;
2563     int         error = 0;
2564 
2565     cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2566                     args->pag, XFS_BTNUM_CNT);
2567     error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2568     if (error)
2569         goto out;
2570 
2571     if (*stat == 0) {
2572         error = -EFSCORRUPTED;
2573         goto out;
2574     }
2575 
2576     error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2577     if (error)
2578         goto out;
2579 
2580     if (*stat == 1 && flen != args->minlen)
2581         *stat = 0;
2582 
2583 out:
2584     xfs_btree_del_cursor(cnt_cur, error);
2585 
2586     return error;
2587 }
2588 #endif
2589 
2590 /*
2591  * Decide whether to use this allocation group for this allocation.
2592  * If so, fix up the btree freelist's size.
2593  */
2594 int         /* error */
2595 xfs_alloc_fix_freelist(
2596     struct xfs_alloc_arg    *args,  /* allocation argument structure */
2597     int         flags)  /* XFS_ALLOC_FLAG_... */
2598 {
2599     struct xfs_mount    *mp = args->mp;
2600     struct xfs_perag    *pag = args->pag;
2601     struct xfs_trans    *tp = args->tp;
2602     struct xfs_buf      *agbp = NULL;
2603     struct xfs_buf      *agflbp = NULL;
2604     struct xfs_alloc_arg    targs;  /* local allocation arguments */
2605     xfs_agblock_t       bno;    /* freelist block */
2606     xfs_extlen_t        need;   /* total blocks needed in freelist */
2607     int         error = 0;
2608 
2609     /* deferred ops (AGFL block frees) require permanent transactions */
2610     ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2611 
2612     if (!pag->pagf_init) {
2613         error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2614         if (error) {
2615             /* Couldn't lock the AGF so skip this AG. */
2616             if (error == -EAGAIN)
2617                 error = 0;
2618             goto out_no_agbp;
2619         }
2620     }
2621 
2622     /*
2623      * If this is a metadata preferred pag and we are user data then try
2624      * somewhere else if we are not being asked to try harder at this
2625      * point
2626      */
2627     if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2628         (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2629         ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2630         goto out_agbp_relse;
2631     }
2632 
2633     need = xfs_alloc_min_freelist(mp, pag);
2634     if (!xfs_alloc_space_available(args, need, flags |
2635             XFS_ALLOC_FLAG_CHECK))
2636         goto out_agbp_relse;
2637 
2638     /*
2639      * Get the a.g. freespace buffer.
2640      * Can fail if we're not blocking on locks, and it's held.
2641      */
2642     if (!agbp) {
2643         error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2644         if (error) {
2645             /* Couldn't lock the AGF so skip this AG. */
2646             if (error == -EAGAIN)
2647                 error = 0;
2648             goto out_no_agbp;
2649         }
2650     }
2651 
2652     /* reset a padding mismatched agfl before final free space check */
2653     if (pag->pagf_agflreset)
2654         xfs_agfl_reset(tp, agbp, pag);
2655 
2656     /* If there isn't enough total space or single-extent, reject it. */
2657     need = xfs_alloc_min_freelist(mp, pag);
2658     if (!xfs_alloc_space_available(args, need, flags))
2659         goto out_agbp_relse;
2660 
2661 #ifdef DEBUG
2662     if (args->alloc_minlen_only) {
2663         int stat;
2664 
2665         error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2666         if (error || !stat)
2667             goto out_agbp_relse;
2668     }
2669 #endif
2670     /*
2671      * Make the freelist shorter if it's too long.
2672      *
2673      * Note that from this point onwards, we will always release the agf and
2674      * agfl buffers on error. This handles the case where we error out and
2675      * the buffers are clean or may not have been joined to the transaction
2676      * and hence need to be released manually. If they have been joined to
2677      * the transaction, then xfs_trans_brelse() will handle them
2678      * appropriately based on the recursion count and dirty state of the
2679      * buffer.
2680      *
2681      * XXX (dgc): When we have lots of free space, does this buy us
2682      * anything other than extra overhead when we need to put more blocks
2683      * back on the free list? Maybe we should only do this when space is
2684      * getting low or the AGFL is more than half full?
2685      *
2686      * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2687      * big; the NORMAP flag prevents AGFL expand/shrink operations from
2688      * updating the rmapbt.  Both flags are used in xfs_repair while we're
2689      * rebuilding the rmapbt, and neither are used by the kernel.  They're
2690      * both required to ensure that rmaps are correctly recorded for the
2691      * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2692      * repair/rmap.c in xfsprogs for details.
2693      */
2694     memset(&targs, 0, sizeof(targs));
2695     /* struct copy below */
2696     if (flags & XFS_ALLOC_FLAG_NORMAP)
2697         targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2698     else
2699         targs.oinfo = XFS_RMAP_OINFO_AG;
2700     while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2701         error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2702         if (error)
2703             goto out_agbp_relse;
2704 
2705         /* defer agfl frees */
2706         xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2707     }
2708 
2709     targs.tp = tp;
2710     targs.mp = mp;
2711     targs.agbp = agbp;
2712     targs.agno = args->agno;
2713     targs.alignment = targs.minlen = targs.prod = 1;
2714     targs.type = XFS_ALLOCTYPE_THIS_AG;
2715     targs.pag = pag;
2716     error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2717     if (error)
2718         goto out_agbp_relse;
2719 
2720     /* Make the freelist longer if it's too short. */
2721     while (pag->pagf_flcount < need) {
2722         targs.agbno = 0;
2723         targs.maxlen = need - pag->pagf_flcount;
2724         targs.resv = XFS_AG_RESV_AGFL;
2725 
2726         /* Allocate as many blocks as possible at once. */
2727         error = xfs_alloc_ag_vextent(&targs);
2728         if (error)
2729             goto out_agflbp_relse;
2730 
2731         /*
2732          * Stop if we run out.  Won't happen if callers are obeying
2733          * the restrictions correctly.  Can happen for free calls
2734          * on a completely full ag.
2735          */
2736         if (targs.agbno == NULLAGBLOCK) {
2737             if (flags & XFS_ALLOC_FLAG_FREEING)
2738                 break;
2739             goto out_agflbp_relse;
2740         }
2741         /*
2742          * Put each allocated block on the list.
2743          */
2744         for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2745             error = xfs_alloc_put_freelist(pag, tp, agbp,
2746                             agflbp, bno, 0);
2747             if (error)
2748                 goto out_agflbp_relse;
2749         }
2750     }
2751     xfs_trans_brelse(tp, agflbp);
2752     args->agbp = agbp;
2753     return 0;
2754 
2755 out_agflbp_relse:
2756     xfs_trans_brelse(tp, agflbp);
2757 out_agbp_relse:
2758     if (agbp)
2759         xfs_trans_brelse(tp, agbp);
2760 out_no_agbp:
2761     args->agbp = NULL;
2762     return error;
2763 }
2764 
2765 /*
2766  * Get a block from the freelist.
2767  * Returns with the buffer for the block gotten.
2768  */
2769 int
2770 xfs_alloc_get_freelist(
2771     struct xfs_perag    *pag,
2772     struct xfs_trans    *tp,
2773     struct xfs_buf      *agbp,
2774     xfs_agblock_t       *bnop,
2775     int         btreeblk)
2776 {
2777     struct xfs_agf      *agf = agbp->b_addr;
2778     struct xfs_buf      *agflbp;
2779     xfs_agblock_t       bno;
2780     __be32          *agfl_bno;
2781     int         error;
2782     uint32_t        logflags;
2783     struct xfs_mount    *mp = tp->t_mountp;
2784 
2785     /*
2786      * Freelist is empty, give up.
2787      */
2788     if (!agf->agf_flcount) {
2789         *bnop = NULLAGBLOCK;
2790         return 0;
2791     }
2792     /*
2793      * Read the array of free blocks.
2794      */
2795     error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2796     if (error)
2797         return error;
2798 
2799 
2800     /*
2801      * Get the block number and update the data structures.
2802      */
2803     agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2804     bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2805     be32_add_cpu(&agf->agf_flfirst, 1);
2806     xfs_trans_brelse(tp, agflbp);
2807     if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2808         agf->agf_flfirst = 0;
2809 
2810     ASSERT(!pag->pagf_agflreset);
2811     be32_add_cpu(&agf->agf_flcount, -1);
2812     pag->pagf_flcount--;
2813 
2814     logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2815     if (btreeblk) {
2816         be32_add_cpu(&agf->agf_btreeblks, 1);
2817         pag->pagf_btreeblks++;
2818         logflags |= XFS_AGF_BTREEBLKS;
2819     }
2820 
2821     xfs_alloc_log_agf(tp, agbp, logflags);
2822     *bnop = bno;
2823 
2824     return 0;
2825 }
2826 
2827 /*
2828  * Log the given fields from the agf structure.
2829  */
2830 void
2831 xfs_alloc_log_agf(
2832     struct xfs_trans    *tp,
2833     struct xfs_buf      *bp,
2834     uint32_t        fields)
2835 {
2836     int first;      /* first byte offset */
2837     int last;       /* last byte offset */
2838     static const short  offsets[] = {
2839         offsetof(xfs_agf_t, agf_magicnum),
2840         offsetof(xfs_agf_t, agf_versionnum),
2841         offsetof(xfs_agf_t, agf_seqno),
2842         offsetof(xfs_agf_t, agf_length),
2843         offsetof(xfs_agf_t, agf_roots[0]),
2844         offsetof(xfs_agf_t, agf_levels[0]),
2845         offsetof(xfs_agf_t, agf_flfirst),
2846         offsetof(xfs_agf_t, agf_fllast),
2847         offsetof(xfs_agf_t, agf_flcount),
2848         offsetof(xfs_agf_t, agf_freeblks),
2849         offsetof(xfs_agf_t, agf_longest),
2850         offsetof(xfs_agf_t, agf_btreeblks),
2851         offsetof(xfs_agf_t, agf_uuid),
2852         offsetof(xfs_agf_t, agf_rmap_blocks),
2853         offsetof(xfs_agf_t, agf_refcount_blocks),
2854         offsetof(xfs_agf_t, agf_refcount_root),
2855         offsetof(xfs_agf_t, agf_refcount_level),
2856         /* needed so that we don't log the whole rest of the structure: */
2857         offsetof(xfs_agf_t, agf_spare64),
2858         sizeof(xfs_agf_t)
2859     };
2860 
2861     trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2862 
2863     xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2864 
2865     xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2866     xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2867 }
2868 
2869 /*
2870  * Put the block on the freelist for the allocation group.
2871  */
2872 int
2873 xfs_alloc_put_freelist(
2874     struct xfs_perag    *pag,
2875     struct xfs_trans    *tp,
2876     struct xfs_buf      *agbp,
2877     struct xfs_buf      *agflbp,
2878     xfs_agblock_t       bno,
2879     int         btreeblk)
2880 {
2881     struct xfs_mount    *mp = tp->t_mountp;
2882     struct xfs_agf      *agf = agbp->b_addr;
2883     __be32          *blockp;
2884     int         error;
2885     uint32_t        logflags;
2886     __be32          *agfl_bno;
2887     int         startoff;
2888 
2889     if (!agflbp) {
2890         error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2891         if (error)
2892             return error;
2893     }
2894 
2895     be32_add_cpu(&agf->agf_fllast, 1);
2896     if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2897         agf->agf_fllast = 0;
2898 
2899     ASSERT(!pag->pagf_agflreset);
2900     be32_add_cpu(&agf->agf_flcount, 1);
2901     pag->pagf_flcount++;
2902 
2903     logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2904     if (btreeblk) {
2905         be32_add_cpu(&agf->agf_btreeblks, -1);
2906         pag->pagf_btreeblks--;
2907         logflags |= XFS_AGF_BTREEBLKS;
2908     }
2909 
2910     xfs_alloc_log_agf(tp, agbp, logflags);
2911 
2912     ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2913 
2914     agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2915     blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2916     *blockp = cpu_to_be32(bno);
2917     startoff = (char *)blockp - (char *)agflbp->b_addr;
2918 
2919     xfs_alloc_log_agf(tp, agbp, logflags);
2920 
2921     xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2922     xfs_trans_log_buf(tp, agflbp, startoff,
2923               startoff + sizeof(xfs_agblock_t) - 1);
2924     return 0;
2925 }
2926 
2927 static xfs_failaddr_t
2928 xfs_agf_verify(
2929     struct xfs_buf      *bp)
2930 {
2931     struct xfs_mount    *mp = bp->b_mount;
2932     struct xfs_agf      *agf = bp->b_addr;
2933 
2934     if (xfs_has_crc(mp)) {
2935         if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2936             return __this_address;
2937         if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2938             return __this_address;
2939     }
2940 
2941     if (!xfs_verify_magic(bp, agf->agf_magicnum))
2942         return __this_address;
2943 
2944     if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2945           be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2946           be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2947           be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2948           be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2949         return __this_address;
2950 
2951     if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2952         return __this_address;
2953 
2954     if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2955         be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2956         return __this_address;
2957 
2958     if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2959         be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2960         be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
2961                         mp->m_alloc_maxlevels ||
2962         be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
2963                         mp->m_alloc_maxlevels)
2964         return __this_address;
2965 
2966     if (xfs_has_rmapbt(mp) &&
2967         (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2968          be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
2969                         mp->m_rmap_maxlevels))
2970         return __this_address;
2971 
2972     if (xfs_has_rmapbt(mp) &&
2973         be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2974         return __this_address;
2975 
2976     /*
2977      * during growfs operations, the perag is not fully initialised,
2978      * so we can't use it for any useful checking. growfs ensures we can't
2979      * use it by using uncached buffers that don't have the perag attached
2980      * so we can detect and avoid this problem.
2981      */
2982     if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2983         return __this_address;
2984 
2985     if (xfs_has_lazysbcount(mp) &&
2986         be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2987         return __this_address;
2988 
2989     if (xfs_has_reflink(mp) &&
2990         be32_to_cpu(agf->agf_refcount_blocks) >
2991         be32_to_cpu(agf->agf_length))
2992         return __this_address;
2993 
2994     if (xfs_has_reflink(mp) &&
2995         (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2996          be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
2997         return __this_address;
2998 
2999     return NULL;
3000 
3001 }
3002 
3003 static void
3004 xfs_agf_read_verify(
3005     struct xfs_buf  *bp)
3006 {
3007     struct xfs_mount *mp = bp->b_mount;
3008     xfs_failaddr_t  fa;
3009 
3010     if (xfs_has_crc(mp) &&
3011         !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3012         xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3013     else {
3014         fa = xfs_agf_verify(bp);
3015         if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3016             xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3017     }
3018 }
3019 
3020 static void
3021 xfs_agf_write_verify(
3022     struct xfs_buf  *bp)
3023 {
3024     struct xfs_mount    *mp = bp->b_mount;
3025     struct xfs_buf_log_item *bip = bp->b_log_item;
3026     struct xfs_agf      *agf = bp->b_addr;
3027     xfs_failaddr_t      fa;
3028 
3029     fa = xfs_agf_verify(bp);
3030     if (fa) {
3031         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3032         return;
3033     }
3034 
3035     if (!xfs_has_crc(mp))
3036         return;
3037 
3038     if (bip)
3039         agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3040 
3041     xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3042 }
3043 
3044 const struct xfs_buf_ops xfs_agf_buf_ops = {
3045     .name = "xfs_agf",
3046     .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3047     .verify_read = xfs_agf_read_verify,
3048     .verify_write = xfs_agf_write_verify,
3049     .verify_struct = xfs_agf_verify,
3050 };
3051 
3052 /*
3053  * Read in the allocation group header (free/alloc section).
3054  */
3055 int
3056 xfs_read_agf(
3057     struct xfs_perag    *pag,
3058     struct xfs_trans    *tp,
3059     int         flags,
3060     struct xfs_buf      **agfbpp)
3061 {
3062     struct xfs_mount    *mp = pag->pag_mount;
3063     int         error;
3064 
3065     trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3066 
3067     error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3068             XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3069             XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3070     if (error)
3071         return error;
3072 
3073     xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3074     return 0;
3075 }
3076 
3077 /*
3078  * Read in the allocation group header (free/alloc section) and initialise the
3079  * perag structure if necessary. If the caller provides @agfbpp, then return the
3080  * locked buffer to the caller, otherwise free it.
3081  */
3082 int
3083 xfs_alloc_read_agf(
3084     struct xfs_perag    *pag,
3085     struct xfs_trans    *tp,
3086     int         flags,
3087     struct xfs_buf      **agfbpp)
3088 {
3089     struct xfs_buf      *agfbp;
3090     struct xfs_agf      *agf;
3091     int         error;
3092     int         allocbt_blks;
3093 
3094     trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3095 
3096     /* We don't support trylock when freeing. */
3097     ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3098             (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3099     error = xfs_read_agf(pag, tp,
3100             (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3101             &agfbp);
3102     if (error)
3103         return error;
3104 
3105     agf = agfbp->b_addr;
3106     if (!pag->pagf_init) {
3107         pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3108         pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3109         pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3110         pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3111         pag->pagf_levels[XFS_BTNUM_BNOi] =
3112             be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3113         pag->pagf_levels[XFS_BTNUM_CNTi] =
3114             be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3115         pag->pagf_levels[XFS_BTNUM_RMAPi] =
3116             be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3117         pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3118         pag->pagf_init = 1;
3119         pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
3120 
3121         /*
3122          * Update the in-core allocbt counter. Filter out the rmapbt
3123          * subset of the btreeblks counter because the rmapbt is managed
3124          * by perag reservation. Subtract one for the rmapbt root block
3125          * because the rmap counter includes it while the btreeblks
3126          * counter only tracks non-root blocks.
3127          */
3128         allocbt_blks = pag->pagf_btreeblks;
3129         if (xfs_has_rmapbt(pag->pag_mount))
3130             allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3131         if (allocbt_blks > 0)
3132             atomic64_add(allocbt_blks,
3133                     &pag->pag_mount->m_allocbt_blks);
3134     }
3135 #ifdef DEBUG
3136     else if (!xfs_is_shutdown(pag->pag_mount)) {
3137         ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3138         ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3139         ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3140         ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3141         ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3142                be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3143         ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3144                be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3145     }
3146 #endif
3147     if (agfbpp)
3148         *agfbpp = agfbp;
3149     else
3150         xfs_trans_brelse(tp, agfbp);
3151     return 0;
3152 }
3153 
3154 /*
3155  * Allocate an extent (variable-size).
3156  * Depending on the allocation type, we either look in a single allocation
3157  * group or loop over the allocation groups to find the result.
3158  */
3159 int             /* error */
3160 xfs_alloc_vextent(
3161     struct xfs_alloc_arg    *args)  /* allocation argument structure */
3162 {
3163     xfs_agblock_t       agsize; /* allocation group size */
3164     int         error;
3165     int         flags;  /* XFS_ALLOC_FLAG_... locking flags */
3166     struct xfs_mount    *mp;    /* mount structure pointer */
3167     xfs_agnumber_t      sagno;  /* starting allocation group number */
3168     xfs_alloctype_t     type;   /* input allocation type */
3169     int         bump_rotor = 0;
3170     xfs_agnumber_t      rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3171 
3172     mp = args->mp;
3173     type = args->otype = args->type;
3174     args->agbno = NULLAGBLOCK;
3175     /*
3176      * Just fix this up, for the case where the last a.g. is shorter
3177      * (or there's only one a.g.) and the caller couldn't easily figure
3178      * that out (xfs_bmap_alloc).
3179      */
3180     agsize = mp->m_sb.sb_agblocks;
3181     if (args->maxlen > agsize)
3182         args->maxlen = agsize;
3183     if (args->alignment == 0)
3184         args->alignment = 1;
3185     ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3186     ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3187     ASSERT(args->minlen <= args->maxlen);
3188     ASSERT(args->minlen <= agsize);
3189     ASSERT(args->mod < args->prod);
3190     if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3191         XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3192         args->minlen > args->maxlen || args->minlen > agsize ||
3193         args->mod >= args->prod) {
3194         args->fsbno = NULLFSBLOCK;
3195         trace_xfs_alloc_vextent_badargs(args);
3196         return 0;
3197     }
3198 
3199     switch (type) {
3200     case XFS_ALLOCTYPE_THIS_AG:
3201     case XFS_ALLOCTYPE_NEAR_BNO:
3202     case XFS_ALLOCTYPE_THIS_BNO:
3203         /*
3204          * These three force us into a single a.g.
3205          */
3206         args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3207         args->pag = xfs_perag_get(mp, args->agno);
3208         error = xfs_alloc_fix_freelist(args, 0);
3209         if (error) {
3210             trace_xfs_alloc_vextent_nofix(args);
3211             goto error0;
3212         }
3213         if (!args->agbp) {
3214             trace_xfs_alloc_vextent_noagbp(args);
3215             break;
3216         }
3217         args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3218         if ((error = xfs_alloc_ag_vextent(args)))
3219             goto error0;
3220         break;
3221     case XFS_ALLOCTYPE_START_BNO:
3222         /*
3223          * Try near allocation first, then anywhere-in-ag after
3224          * the first a.g. fails.
3225          */
3226         if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3227             xfs_is_inode32(mp)) {
3228             args->fsbno = XFS_AGB_TO_FSB(mp,
3229                     ((mp->m_agfrotor / rotorstep) %
3230                     mp->m_sb.sb_agcount), 0);
3231             bump_rotor = 1;
3232         }
3233         args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3234         args->type = XFS_ALLOCTYPE_NEAR_BNO;
3235         fallthrough;
3236     case XFS_ALLOCTYPE_FIRST_AG:
3237         /*
3238          * Rotate through the allocation groups looking for a winner.
3239          */
3240         if (type == XFS_ALLOCTYPE_FIRST_AG) {
3241             /*
3242              * Start with allocation group given by bno.
3243              */
3244             args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3245             args->type = XFS_ALLOCTYPE_THIS_AG;
3246             sagno = 0;
3247             flags = 0;
3248         } else {
3249             /*
3250              * Start with the given allocation group.
3251              */
3252             args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3253             flags = XFS_ALLOC_FLAG_TRYLOCK;
3254         }
3255         /*
3256          * Loop over allocation groups twice; first time with
3257          * trylock set, second time without.
3258          */
3259         for (;;) {
3260             args->pag = xfs_perag_get(mp, args->agno);
3261             error = xfs_alloc_fix_freelist(args, flags);
3262             if (error) {
3263                 trace_xfs_alloc_vextent_nofix(args);
3264                 goto error0;
3265             }
3266             /*
3267              * If we get a buffer back then the allocation will fly.
3268              */
3269             if (args->agbp) {
3270                 if ((error = xfs_alloc_ag_vextent(args)))
3271                     goto error0;
3272                 break;
3273             }
3274 
3275             trace_xfs_alloc_vextent_loopfailed(args);
3276 
3277             /*
3278              * Didn't work, figure out the next iteration.
3279              */
3280             if (args->agno == sagno &&
3281                 type == XFS_ALLOCTYPE_START_BNO)
3282                 args->type = XFS_ALLOCTYPE_THIS_AG;
3283             /*
3284             * For the first allocation, we can try any AG to get
3285             * space.  However, if we already have allocated a
3286             * block, we don't want to try AGs whose number is below
3287             * sagno. Otherwise, we may end up with out-of-order
3288             * locking of AGF, which might cause deadlock.
3289             */
3290             if (++(args->agno) == mp->m_sb.sb_agcount) {
3291                 if (args->tp->t_firstblock != NULLFSBLOCK)
3292                     args->agno = sagno;
3293                 else
3294                     args->agno = 0;
3295             }
3296             /*
3297              * Reached the starting a.g., must either be done
3298              * or switch to non-trylock mode.
3299              */
3300             if (args->agno == sagno) {
3301                 if (flags == 0) {
3302                     args->agbno = NULLAGBLOCK;
3303                     trace_xfs_alloc_vextent_allfailed(args);
3304                     break;
3305                 }
3306 
3307                 flags = 0;
3308                 if (type == XFS_ALLOCTYPE_START_BNO) {
3309                     args->agbno = XFS_FSB_TO_AGBNO(mp,
3310                         args->fsbno);
3311                     args->type = XFS_ALLOCTYPE_NEAR_BNO;
3312                 }
3313             }
3314             xfs_perag_put(args->pag);
3315         }
3316         if (bump_rotor) {
3317             if (args->agno == sagno)
3318                 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3319                     (mp->m_sb.sb_agcount * rotorstep);
3320             else
3321                 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3322                     (mp->m_sb.sb_agcount * rotorstep);
3323         }
3324         break;
3325     default:
3326         ASSERT(0);
3327         /* NOTREACHED */
3328     }
3329     if (args->agbno == NULLAGBLOCK)
3330         args->fsbno = NULLFSBLOCK;
3331     else {
3332         args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3333 #ifdef DEBUG
3334         ASSERT(args->len >= args->minlen);
3335         ASSERT(args->len <= args->maxlen);
3336         ASSERT(args->agbno % args->alignment == 0);
3337         XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3338             args->len);
3339 #endif
3340 
3341     }
3342     xfs_perag_put(args->pag);
3343     return 0;
3344 error0:
3345     xfs_perag_put(args->pag);
3346     return error;
3347 }
3348 
3349 /* Ensure that the freelist is at full capacity. */
3350 int
3351 xfs_free_extent_fix_freelist(
3352     struct xfs_trans    *tp,
3353     struct xfs_perag    *pag,
3354     struct xfs_buf      **agbp)
3355 {
3356     struct xfs_alloc_arg    args;
3357     int         error;
3358 
3359     memset(&args, 0, sizeof(struct xfs_alloc_arg));
3360     args.tp = tp;
3361     args.mp = tp->t_mountp;
3362     args.agno = pag->pag_agno;
3363     args.pag = pag;
3364 
3365     /*
3366      * validate that the block number is legal - the enables us to detect
3367      * and handle a silent filesystem corruption rather than crashing.
3368      */
3369     if (args.agno >= args.mp->m_sb.sb_agcount)
3370         return -EFSCORRUPTED;
3371 
3372     error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3373     if (error)
3374         return error;
3375 
3376     *agbp = args.agbp;
3377     return 0;
3378 }
3379 
3380 /*
3381  * Free an extent.
3382  * Just break up the extent address and hand off to xfs_free_ag_extent
3383  * after fixing up the freelist.
3384  */
3385 int
3386 __xfs_free_extent(
3387     struct xfs_trans        *tp,
3388     xfs_fsblock_t           bno,
3389     xfs_extlen_t            len,
3390     const struct xfs_owner_info *oinfo,
3391     enum xfs_ag_resv_type       type,
3392     bool                skip_discard)
3393 {
3394     struct xfs_mount        *mp = tp->t_mountp;
3395     struct xfs_buf          *agbp;
3396     xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, bno);
3397     xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, bno);
3398     struct xfs_agf          *agf;
3399     int             error;
3400     unsigned int            busy_flags = 0;
3401     struct xfs_perag        *pag;
3402 
3403     ASSERT(len != 0);
3404     ASSERT(type != XFS_AG_RESV_AGFL);
3405 
3406     if (XFS_TEST_ERROR(false, mp,
3407             XFS_ERRTAG_FREE_EXTENT))
3408         return -EIO;
3409 
3410     pag = xfs_perag_get(mp, agno);
3411     error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3412     if (error)
3413         goto err;
3414     agf = agbp->b_addr;
3415 
3416     if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3417         error = -EFSCORRUPTED;
3418         goto err_release;
3419     }
3420 
3421     /* validate the extent size is legal now we have the agf locked */
3422     if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3423         error = -EFSCORRUPTED;
3424         goto err_release;
3425     }
3426 
3427     error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3428     if (error)
3429         goto err_release;
3430 
3431     if (skip_discard)
3432         busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3433     xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3434     xfs_perag_put(pag);
3435     return 0;
3436 
3437 err_release:
3438     xfs_trans_brelse(tp, agbp);
3439 err:
3440     xfs_perag_put(pag);
3441     return error;
3442 }
3443 
3444 struct xfs_alloc_query_range_info {
3445     xfs_alloc_query_range_fn    fn;
3446     void                *priv;
3447 };
3448 
3449 /* Format btree record and pass to our callback. */
3450 STATIC int
3451 xfs_alloc_query_range_helper(
3452     struct xfs_btree_cur        *cur,
3453     const union xfs_btree_rec   *rec,
3454     void                *priv)
3455 {
3456     struct xfs_alloc_query_range_info   *query = priv;
3457     struct xfs_alloc_rec_incore     irec;
3458 
3459     irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3460     irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3461     return query->fn(cur, &irec, query->priv);
3462 }
3463 
3464 /* Find all free space within a given range of blocks. */
3465 int
3466 xfs_alloc_query_range(
3467     struct xfs_btree_cur            *cur,
3468     const struct xfs_alloc_rec_incore   *low_rec,
3469     const struct xfs_alloc_rec_incore   *high_rec,
3470     xfs_alloc_query_range_fn        fn,
3471     void                    *priv)
3472 {
3473     union xfs_btree_irec            low_brec;
3474     union xfs_btree_irec            high_brec;
3475     struct xfs_alloc_query_range_info   query;
3476 
3477     ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3478     low_brec.a = *low_rec;
3479     high_brec.a = *high_rec;
3480     query.priv = priv;
3481     query.fn = fn;
3482     return xfs_btree_query_range(cur, &low_brec, &high_brec,
3483             xfs_alloc_query_range_helper, &query);
3484 }
3485 
3486 /* Find all free space records. */
3487 int
3488 xfs_alloc_query_all(
3489     struct xfs_btree_cur            *cur,
3490     xfs_alloc_query_range_fn        fn,
3491     void                    *priv)
3492 {
3493     struct xfs_alloc_query_range_info   query;
3494 
3495     ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3496     query.priv = priv;
3497     query.fn = fn;
3498     return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3499 }
3500 
3501 /* Is there a record covering a given extent? */
3502 int
3503 xfs_alloc_has_record(
3504     struct xfs_btree_cur    *cur,
3505     xfs_agblock_t       bno,
3506     xfs_extlen_t        len,
3507     bool            *exists)
3508 {
3509     union xfs_btree_irec    low;
3510     union xfs_btree_irec    high;
3511 
3512     memset(&low, 0, sizeof(low));
3513     low.a.ar_startblock = bno;
3514     memset(&high, 0xFF, sizeof(high));
3515     high.a.ar_startblock = bno + len - 1;
3516 
3517     return xfs_btree_has_record(cur, &low, &high, exists);
3518 }
3519 
3520 /*
3521  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3522  * error code or XFS_ITER_*.
3523  */
3524 int
3525 xfs_agfl_walk(
3526     struct xfs_mount    *mp,
3527     struct xfs_agf      *agf,
3528     struct xfs_buf      *agflbp,
3529     xfs_agfl_walk_fn    walk_fn,
3530     void            *priv)
3531 {
3532     __be32          *agfl_bno;
3533     unsigned int        i;
3534     int         error;
3535 
3536     agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3537     i = be32_to_cpu(agf->agf_flfirst);
3538 
3539     /* Nothing to walk in an empty AGFL. */
3540     if (agf->agf_flcount == cpu_to_be32(0))
3541         return 0;
3542 
3543     /* Otherwise, walk from first to last, wrapping as needed. */
3544     for (;;) {
3545         error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3546         if (error)
3547             return error;
3548         if (i == be32_to_cpu(agf->agf_fllast))
3549             break;
3550         if (++i == xfs_agfl_size(mp))
3551             i = 0;
3552     }
3553 
3554     return 0;
3555 }
3556 
3557 int __init
3558 xfs_extfree_intent_init_cache(void)
3559 {
3560     xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3561             sizeof(struct xfs_extent_free_item),
3562             0, 0, NULL);
3563 
3564     return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
3565 }
3566 
3567 void
3568 xfs_extfree_intent_destroy_cache(void)
3569 {
3570     kmem_cache_destroy(xfs_extfree_item_cache);
3571     xfs_extfree_item_cache = NULL;
3572 }