// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

void
xfs_extent_busy_insert(
    struct xfs_trans    *tp,
    struct xfs_perag    *pag,
    xfs_agblock_t       bno,
    xfs_extlen_t        len,
    unsigned int        flags)
{
    struct xfs_extent_busy  *new;
    struct xfs_extent_busy  *busyp;
    struct rb_node      **rbp;
    struct rb_node      *parent = NULL;

    new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
    new->agno = pag->pag_agno;
    new->bno = bno;
    new->length = len;
    INIT_LIST_HEAD(&new->list);
    new->flags = flags;

    /* trace before insert to be able to see failed inserts */
    trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);

    spin_lock(&pag->pagb_lock);
    rbp = &pag->pagb_tree.rb_node;
    while (*rbp) {
        parent = *rbp;
        busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

        if (new->bno < busyp->bno) {
            rbp = &(*rbp)->rb_left;
            ASSERT(new->bno + new->length <= busyp->bno);
        } else if (new->bno > busyp->bno) {
            rbp = &(*rbp)->rb_right;
            ASSERT(bno >= busyp->bno + busyp->length);
        } else {
            ASSERT(0);
        }
    }

    rb_link_node(&new->rb_node, parent, rbp);
    rb_insert_color(&new->rb_node, &pag->pagb_tree);

    list_add(&new->list, &tp->t_busy);
    spin_unlock(&pag->pagb_lock);
}
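
/*
 * Illustrative sketch, not part of the original file: extent-freeing code
 * marks a just-freed range busy so it cannot be reallocated and written
 * before the freeing transaction is committed to the log.  The helper name
 * and its skip_discard decision are hypothetical placeholders, not the
 * actual caller.
 */
static inline void
example_mark_freed_extent_busy(
    struct xfs_trans    *tp,
    struct xfs_perag    *pag,
    xfs_agblock_t       bno,
    xfs_extlen_t        len,
    bool            skip_discard)
{
    /* Record the range in the per-AG busy tree and the transaction list. */
    xfs_extent_busy_insert(tp, pag, bno, len,
            skip_discard ? XFS_EXTENT_BUSY_SKIP_DISCARD : 0);
}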

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken internally, so the caller
 * must not already hold it.  This function returns 0 for no overlapping
 * busy extent, -1 for an overlapping but not exact busy extent, and 1 for
 * an exact match.  A non-zero return therefore indicates an overlap that
 * will require a synchronous transaction, while still distinguishing
 * between a partial and an exact match.
 */
int
xfs_extent_busy_search(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    xfs_agblock_t       bno,
    xfs_extlen_t        len)
{
    struct rb_node      *rbp;
    struct xfs_extent_busy  *busyp;
    int         match = 0;

    /* find closest start bno overlap */
    spin_lock(&pag->pagb_lock);
    rbp = pag->pagb_tree.rb_node;
    while (rbp) {
        busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
        if (bno < busyp->bno) {
            /* may overlap, but exact start block is lower */
            if (bno + len > busyp->bno)
                match = -1;
            rbp = rbp->rb_left;
        } else if (bno > busyp->bno) {
            /* may overlap, but exact start block is higher */
            if (bno < busyp->bno + busyp->length)
                match = -1;
            rbp = rbp->rb_right;
        } else {
            /* bno matches busyp, length determines exact match */
            match = (busyp->length == len) ? 1 : -1;
            break;
        }
    }
    spin_unlock(&pag->pagb_lock);
    return match;
}
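
/*
 * Illustrative sketch, not part of the original file: one way a caller
 * could interpret the tristate result of xfs_extent_busy_search().  The
 * helper name is hypothetical.
 */
static inline bool
example_extent_overlaps_busy(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    xfs_agblock_t       bno,
    xfs_extlen_t        len)
{
    int         match = xfs_extent_busy_search(mp, pag, bno, len);

    /*
     * 0 means no overlap, 1 an exact match, -1 a partial overlap.  Any
     * non-zero value means the range is at least partly busy and reusing
     * it requires a synchronous transaction.
     */
    return match != 0;
}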

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    struct xfs_extent_busy  *busyp,
    xfs_agblock_t       fbno,
    xfs_extlen_t        flen,
    bool            userdata) __releases(&pag->pagb_lock)
                      __acquires(&pag->pagb_lock)
{
    xfs_agblock_t       fend = fbno + flen;
    xfs_agblock_t       bbno = busyp->bno;
    xfs_agblock_t       bend = bbno + busyp->length;

    /*
     * This extent is currently being discarded.  Give the thread
     * performing the discard a chance to mark the extent unbusy
     * and retry.
     */
    if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
        spin_unlock(&pag->pagb_lock);
        delay(1);
        spin_lock(&pag->pagb_lock);
        return false;
    }

    /*
     * If there is a busy extent overlapping a user allocation, we have
     * no choice but to force the log and retry the search.
     *
     * Fortunately this does not happen during normal operation, but
     * only if the filesystem is very low on space and has to dip into
     * the AGFL for normal allocations.
     */
    if (userdata)
        goto out_force_log;

    if (bbno < fbno && bend > fend) {
        /*
         * Case 1:
         *    bbno           bend
         *    +BBBBBBBBBBBBBBBBB+
         *        +---------+
         *        fbno   fend
         */

        /*
         * We would have to split the busy extent to be able to track
         * it correctly, which we cannot do because we would have to
         * modify the list of busy extents attached to the transaction
         * or CIL context, which is immutable.
         *
         * Force out the log to clear the busy extent and retry the
         * search.
         */
        goto out_force_log;
    } else if (bbno >= fbno && bend <= fend) {
        /*
         * Case 2:
         *    bbno           bend
         *    +BBBBBBBBBBBBBBBBB+
         *    +-----------------+
         *    fbno           fend
         *
         * Case 3:
         *    bbno           bend
         *    +BBBBBBBBBBBBBBBBB+
         *    +--------------------------+
         *    fbno                    fend
         *
         * Case 4:
         *             bbno           bend
         *             +BBBBBBBBBBBBBBBBB+
         *    +--------------------------+
         *    fbno                    fend
         *
         * Case 5:
         *             bbno           bend
         *             +BBBBBBBBBBBBBBBBB+
         *    +-----------------------------------+
         *    fbno                             fend
         *
         */

        /*
         * The busy extent is fully covered by the extent we are
         * allocating, and can simply be removed from the rbtree.
         * However we cannot remove it from the immutable list
         * tracking busy extents in the transaction or CIL context,
         * so set the length to zero to mark it invalid.
         *
         * We also need to restart the busy extent search from the
         * tree root, because erasing the node can rearrange the
         * tree topology.
         */
        rb_erase(&busyp->rb_node, &pag->pagb_tree);
        busyp->length = 0;
        return false;
    } else if (fend < bend) {
        /*
         * Case 6:
         *             bbno           bend
         *             +BBBBBBBBBBBBBBBBB+
         *             +---------+
         *             fbno   fend
         *
         * Case 7:
         *             bbno           bend
         *             +BBBBBBBBBBBBBBBBB+
         *    +------------------+
         *    fbno            fend
         *
         */
        busyp->bno = fend;
    } else if (bbno < fbno) {
        /*
         * Case 8:
         *    bbno           bend
         *    +BBBBBBBBBBBBBBBBB+
         *        +-------------+
         *        fbno       fend
         *
         * Case 9:
         *    bbno           bend
         *    +BBBBBBBBBBBBBBBBB+
         *        +----------------------+
         *        fbno                fend
         */
        busyp->length = fbno - busyp->bno;
    } else {
        ASSERT(0);
    }

    trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
    return true;

out_force_log:
    spin_unlock(&pag->pagb_lock);
    xfs_log_force(mp, XFS_LOG_SYNC);
    trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
    spin_lock(&pag->pagb_lock);
    return false;
}
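
/*
 * Worked example (illustrative, not part of the original source): with a
 * busy extent [100, 120) and a found free extent [110, 130), bbno < fbno
 * and fend > bend, so only the tail of the busy extent is trimmed off:
 * busyp->length becomes 110 - 100 = 10 and [110, 130) may be reused.  Had
 * the free extent been [90, 130), the busy extent would have been fully
 * covered, erased from the rbtree and marked invalid with a zero length.
 */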


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    xfs_agblock_t       fbno,
    xfs_extlen_t        flen,
    bool            userdata)
{
    struct rb_node      *rbp;

    ASSERT(flen > 0);
    spin_lock(&pag->pagb_lock);
restart:
    rbp = pag->pagb_tree.rb_node;
    while (rbp) {
        struct xfs_extent_busy *busyp =
            rb_entry(rbp, struct xfs_extent_busy, rb_node);
        xfs_agblock_t   bbno = busyp->bno;
        xfs_agblock_t   bend = bbno + busyp->length;

        if (fbno + flen <= bbno) {
            rbp = rbp->rb_left;
            continue;
        } else if (fbno >= bend) {
            rbp = rbp->rb_right;
            continue;
        }

        if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                          userdata))
            goto restart;
    }
    spin_unlock(&pag->pagb_lock);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared. Note that the busy list is not guaranteed to be empty after
 * the gen is woken. The state of a specific extent must always be confirmed
 * with another call to xfs_extent_busy_trim() before it can be used.
 */
bool
xfs_extent_busy_trim(
    struct xfs_alloc_arg    *args,
    xfs_agblock_t       *bno,
    xfs_extlen_t        *len,
    unsigned        *busy_gen)
{
    xfs_agblock_t       fbno;
    xfs_extlen_t        flen;
    struct rb_node      *rbp;
    bool            ret = false;

    ASSERT(*len > 0);

    spin_lock(&args->pag->pagb_lock);
    fbno = *bno;
    flen = *len;
    rbp = args->pag->pagb_tree.rb_node;
    while (rbp && flen >= args->minlen) {
        struct xfs_extent_busy *busyp =
            rb_entry(rbp, struct xfs_extent_busy, rb_node);
        xfs_agblock_t   fend = fbno + flen;
        xfs_agblock_t   bbno = busyp->bno;
        xfs_agblock_t   bend = bbno + busyp->length;

        if (fend <= bbno) {
            rbp = rbp->rb_left;
            continue;
        } else if (fbno >= bend) {
            rbp = rbp->rb_right;
            continue;
        }

        if (bbno <= fbno) {
            /* start overlap */

            /*
             * Case 1:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *        +---------+
             *        fbno   fend
             *
             * Case 2:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *    +-------------+
             *    fbno       fend
             *
             * Case 3:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *        +-------------+
             *        fbno       fend
             *
             * Case 4:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *    +-----------------+
             *    fbno           fend
             *
             * No unbusy region in extent, return failure.
             */
            if (fend <= bend)
                goto fail;

            /*
             * Case 5:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *        +----------------------+
             *        fbno                fend
             *
             * Case 6:
             *    bbno           bend
             *    +BBBBBBBBBBBBBBBBB+
             *    +--------------------------+
             *    fbno                    fend
             *
             * Needs to be trimmed to:
             *                       +-------+
             *                       fbno fend
             */
            fbno = bend;
        } else if (bend >= fend) {
            /* end overlap */

            /*
             * Case 7:
             *             bbno           bend
             *             +BBBBBBBBBBBBBBBBB+
             *    +------------------+
             *    fbno            fend
             *
             * Case 8:
             *             bbno           bend
             *             +BBBBBBBBBBBBBBBBB+
             *    +--------------------------+
             *    fbno                    fend
             *
             * Needs to be trimmed to:
             *    +-------+
             *    fbno fend
             */
            fend = bbno;
        } else {
            /* middle overlap */

            /*
             * Case 9:
             *             bbno           bend
             *             +BBBBBBBBBBBBBBBBB+
             *    +-----------------------------------+
             *    fbno                             fend
             *
             * Can be trimmed to:
             *    +-------+        OR         +-------+
             *    fbno fend                   fbno fend
             *
             * Backward allocation leads to significant
             * fragmentation of directories, which degrades
             * directory performance, therefore we always want to
             * choose the option that produces forward allocation
             * patterns.
             * Preferring the lower bno extent will make the next
             * request use "fend" as the start of the next
             * allocation;  if the segment is no longer busy at
             * that point, we'll get a contiguous allocation, but
             * even if it is still busy, we will get a forward
             * allocation.
             * We try to avoid choosing the segment at "bend",
             * because that can lead to the next allocation
             * taking the segment at "fbno", which would be a
             * backward allocation.  We only use the segment at
             * "bend" if it is much larger than the current
             * requested size, because in that case there's a
             * good chance subsequent allocations will be
             * contiguous.
             */
            if (bbno - fbno >= args->maxlen) {
                /* left candidate fits perfectly */
                fend = bbno;
            } else if (fend - bend >= args->maxlen * 4) {
                /* right candidate has enough free space */
                fbno = bend;
            } else if (bbno - fbno >= args->minlen) {
                /* left candidate fits minimum requirement */
                fend = bbno;
            } else {
                goto fail;
            }
        }

        flen = fend - fbno;
    }
out:

    if (fbno != *bno || flen != *len) {
        trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
                      fbno, flen);
        *bno = fbno;
        *len = flen;
        *busy_gen = args->pag->pagb_gen;
        ret = true;
    }
    spin_unlock(&args->pag->pagb_lock);
    return ret;
fail:
    /*
     * Return a zero extent length as a failure indication.  All callers
     * re-check if the trimmed extent satisfies the minlen requirement.
     */
    flen = 0;
    goto out;
}
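
/*
 * Worked example (illustrative, not part of the original source): with
 * args->minlen = 8, args->maxlen = 16, a found extent [0, 64) and a busy
 * extent [24, 40), this is a middle overlap.  The left candidate [0, 24)
 * is 24 blocks, which satisfies maxlen, so the extent is trimmed to
 * [0, 24) and the busy range is left behind for a later, forward
 * allocation.  Were the left candidate smaller than minlen and the right
 * candidate [40, 64) smaller than 4 * maxlen, the trim would fail and the
 * caller would have to force the log and retry.
 */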

STATIC void
xfs_extent_busy_clear_one(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    struct xfs_extent_busy  *busyp)
{
    if (busyp->length) {
        trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                        busyp->length);
        rb_erase(&busyp->rb_node, &pag->pagb_tree);
    }

    list_del_init(&busyp->list);
    kmem_free(busyp);
}

static void
xfs_extent_busy_put_pag(
    struct xfs_perag    *pag,
    bool            wakeup)
        __releases(pag->pagb_lock)
{
    if (wakeup) {
        pag->pagb_gen++;
        wake_up_all(&pag->pagb_wait);
    }

    spin_unlock(&pag->pagb_lock);
    xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed-in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
    struct xfs_mount    *mp,
    struct list_head    *list,
    bool            do_discard)
{
    struct xfs_extent_busy  *busyp, *n;
    struct xfs_perag    *pag = NULL;
    xfs_agnumber_t      agno = NULLAGNUMBER;
    bool            wakeup = false;

    list_for_each_entry_safe(busyp, n, list, list) {
        if (busyp->agno != agno) {
            if (pag)
                xfs_extent_busy_put_pag(pag, wakeup);
            agno = busyp->agno;
            pag = xfs_perag_get(mp, agno);
            spin_lock(&pag->pagb_lock);
            wakeup = false;
        }

        if (do_discard && busyp->length &&
            !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
            busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
        } else {
            xfs_extent_busy_clear_one(mp, pag, busyp);
            wakeup = true;
        }
    }

    if (pag)
        xfs_extent_busy_put_pag(pag, wakeup);
}

/*
 * Flush out all busy extents for this AG.
 */
void
xfs_extent_busy_flush(
    struct xfs_mount    *mp,
    struct xfs_perag    *pag,
    unsigned        busy_gen)
{
    DEFINE_WAIT     (wait);
    int         error;

    error = xfs_log_force(mp, XFS_LOG_SYNC);
    if (error)
        return;

    do {
        prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
        if (busy_gen != READ_ONCE(pag->pagb_gen))
            break;
        schedule();
    } while (1);

    finish_wait(&pag->pagb_wait, &wait);
}
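
/*
 * Illustrative sketch, not part of the original file: the retry pattern a
 * caller might build around xfs_extent_busy_trim() and
 * xfs_extent_busy_flush().  The helper name is hypothetical; real callers
 * embed this logic in their allocation loops.
 */
static inline bool
example_trim_or_wait(
    struct xfs_alloc_arg    *args,
    xfs_agblock_t       *bno,
    xfs_extlen_t        *len)
{
    unsigned        busy_gen;

    if (xfs_extent_busy_trim(args, bno, len, &busy_gen) &&
        *len < args->minlen) {
        /*
         * Everything usable in the found extent is busy: force the
         * log, wait for the busy generation to advance and tell the
         * caller to retry the search.
         */
        xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
        return false;
    }
    return true;
}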

void
xfs_extent_busy_wait_all(
    struct xfs_mount    *mp)
{
    struct xfs_perag    *pag;
    DEFINE_WAIT     (wait);
    xfs_agnumber_t      agno;

    for_each_perag(mp, agno, pag) {
        do {
            prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
            if (RB_EMPTY_ROOT(&pag->pagb_tree))
                break;
            schedule();
        } while (1);
        finish_wait(&pag->pagb_wait, &wait);
    }
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
    void            *priv,
    const struct list_head  *l1,
    const struct list_head  *l2)
{
    struct xfs_extent_busy  *b1 =
        container_of(l1, struct xfs_extent_busy, list);
    struct xfs_extent_busy  *b2 =
        container_of(l2, struct xfs_extent_busy, list);
    s32 diff;

    diff = b1->agno - b2->agno;
    if (!diff)
        diff = b1->bno - b2->bno;
    return diff;
}
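
/*
 * Illustrative sketch, not part of the original file: the comparator above
 * is intended to be passed to list_sort() so busy extents are processed in
 * ascending (agno, bno) order.  The list head is hypothetical, and the
 * caller needs linux/list_sort.h.
 */
static inline void
example_sort_busy_extents(struct list_head *busy_list)
{
    list_sort(NULL, busy_list, xfs_extent_busy_ag_cmp);
}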