// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_iwalk.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"

/*
 * Walking Inodes in the Filesystem
 * ================================
 *
 * This iterator function walks a subset of filesystem inodes in increasing
 * order from @startino until there are no more inodes.  For each allocated
 * inode it finds, it calls a walk function with the relevant inode number and
 * a pointer to caller-provided data.  The walk function can return the usual
 * negative error code to stop the iteration; 0 to continue the iteration; or
 * -ECANCELED to stop the iteration.  This return value is returned to the
 * caller.
 *
 * Internally, we allow the walk function to do anything, which means that we
 * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
 * therefore cache the inobt records in kernel memory and only call the walk
 * function when our memory buffer is full.  @nr_recs is the number of records
 * that we've cached, and @sz_recs is the size of our cache.
 *
 * It is the responsibility of the walk function to ensure it accesses
 * allocated inodes, as the inobt records may be stale by the time they are
 * acted upon.
 */
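
/*
 * A minimal walk function following the convention above (an illustrative
 * sketch, not part of the original file; the name find_ino_fn and its
 * target inode are made up).  The signature matches the iwalk_fn call site
 * in xfs_iwalk_ag_recs() below.
 *
 *    static int
 *    find_ino_fn(
 *        struct xfs_mount    *mp,
 *        struct xfs_trans    *tp,
 *        xfs_ino_t       ino,
 *        void            *data)
 *    {
 *        xfs_ino_t   *target = data;
 *
 *        if (ino == *target)
 *            return -ECANCELED;
 *        return 0;
 *    }
 *
 * Returning -ECANCELED stops the walk, and the walk entry points hand that
 * value back to their caller, so the caller can distinguish "found it" from
 * "ran out of inodes".
 */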

struct xfs_iwalk_ag {
    /* parallel work control data; will be null if single threaded */
    struct xfs_pwork        pwork;

    struct xfs_mount        *mp;
    struct xfs_trans        *tp;
    struct xfs_perag        *pag;

    /* Where do we start the traversal? */
    xfs_ino_t           startino;

    /* What was the last inode number we saw when iterating the inobt? */
    xfs_ino_t           lastino;

    /* Array of inobt records we cache. */
    struct xfs_inobt_rec_incore *recs;

    /* Number of entries allocated for the @recs array. */
    unsigned int            sz_recs;

    /* Number of entries in the @recs array that are in use. */
    unsigned int            nr_recs;

    /* Inode walk function and data pointer. */
    xfs_iwalk_fn            iwalk_fn;
    xfs_inobt_walk_fn       inobt_walk_fn;
    void                *data;

    /*
     * Make it look like the inodes up to startino are free so that
     * bulkstat can start its inode iteration at the correct place without
     * needing to special case everywhere.
     */
    unsigned int            trim_start:1;

    /* Skip empty inobt records? */
    unsigned int            skip_empty:1;

    /* Drop the (hopefully empty) transaction when calling iwalk_fn. */
    unsigned int            drop_trans:1;
};

/*
 * Loop over all clusters in a chunk for a given incore inode allocation btree
 * record.  Do a readahead if there are any allocated inodes in that cluster.
 */
STATIC void
xfs_iwalk_ichunk_ra(
    struct xfs_mount        *mp,
    struct xfs_perag        *pag,
    struct xfs_inobt_rec_incore *irec)
{
    struct xfs_ino_geometry     *igeo = M_IGEO(mp);
    xfs_agblock_t           agbno;
    struct blk_plug         plug;
    int             i;  /* inode chunk index */

    agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);

    blk_start_plug(&plug);
    for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
        xfs_inofree_t   imask;

        imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
        if (imask & ~irec->ir_free) {
            xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
                    igeo->blocks_per_cluster,
                    &xfs_inode_buf_ops);
        }
        agbno += igeo->blocks_per_cluster;
    }
    blk_finish_plug(&plug);
}
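
/*
 * Illustrative numbers (not from the original source): with 64 inodes per
 * chunk and a hypothetical 32 inodes per cluster, the loop above checks two
 * clusters.  If irec->ir_free is 0xFFFFFFFF00000000 (the upper 32 inodes
 * free), then cluster 0's mask 0x00000000FFFFFFFF has bits outside ir_free,
 * so its buffers get readahead; cluster 1 is entirely free and is skipped.
 */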

/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them.  This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
STATIC void
xfs_iwalk_adjust_start(
    xfs_agino_t         agino,  /* starting inode of the walk */
    struct xfs_inobt_rec_incore *irec)  /* btree record */
{
    int             idx;    /* index into inode chunk */
    int             i;

    idx = agino - irec->ir_startino;

    /*
     * The chunk starts before @agino, so some of its inodes precede our
     * starting point.  Count the allocated ones among them as free and
     * mark them free in the mask so that the walk skips them.
     */
    for (i = 0; i < idx; i++) {
        if (XFS_INOBT_MASK(i) & ~irec->ir_free)
            irec->ir_freecount++;
    }

    irec->ir_free |= xfs_inobt_maskn(0, idx);
}
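
/*
 * Worked example (illustrative numbers): resuming at agino 131 within a
 * chunk whose ir_startino is 128 gives idx = 3.  Any of inodes 128-130 that
 * were allocated are added to ir_freecount, and xfs_inobt_maskn(0, 3) = 0x7
 * sets their bits in ir_free, so the walk reports nothing before inode 131.
 */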

/* Allocate memory for a walk. */
STATIC int
xfs_iwalk_alloc(
    struct xfs_iwalk_ag *iwag)
{
    size_t          size;

    ASSERT(iwag->recs == NULL);
    iwag->nr_recs = 0;

    /* Allocate a prefetch buffer for inobt records. */
    size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
    iwag->recs = kmem_alloc(size, KM_MAYFAIL);
    if (iwag->recs == NULL)
        return -ENOMEM;

    return 0;
}

/* Free memory we allocated for a walk. */
STATIC void
xfs_iwalk_free(
    struct xfs_iwalk_ag *iwag)
{
    kmem_free(iwag->recs);
    iwag->recs = NULL;
}
/* For each in-use inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
    struct xfs_iwalk_ag *iwag)
{
    struct xfs_mount    *mp = iwag->mp;
    struct xfs_trans    *tp = iwag->tp;
    struct xfs_perag    *pag = iwag->pag;
    xfs_ino_t       ino;
    unsigned int        i, j;
    int         error;

    for (i = 0; i < iwag->nr_recs; i++) {
        struct xfs_inobt_rec_incore *irec = &iwag->recs[i];

        trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);

        if (xfs_pwork_want_abort(&iwag->pwork))
            return 0;

        if (iwag->inobt_walk_fn) {
            error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
                    iwag->data);
            if (error)
                return error;
        }

        if (!iwag->iwalk_fn)
            continue;

        for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
            if (xfs_pwork_want_abort(&iwag->pwork))
                return 0;

            /* Skip if this inode is free */
            if (XFS_INOBT_MASK(j) & irec->ir_free)
                continue;

            /* Otherwise call our function. */
            ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
                        irec->ir_startino + j);
            error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
            if (error)
                return error;
        }
    }

    return 0;
}

/* Delete cursor and let go of AGI. */
static inline void
xfs_iwalk_del_inobt(
    struct xfs_trans    *tp,
    struct xfs_btree_cur    **curpp,
    struct xfs_buf      **agi_bpp,
    int         error)
{
    if (*curpp) {
        xfs_btree_del_cursor(*curpp, error);
        *curpp = NULL;
    }
    if (*agi_bpp) {
        xfs_trans_brelse(tp, *agi_bpp);
        *agi_bpp = NULL;
    }
}

/*
 * Set ourselves up for walking inobt records starting from a given point in
 * the filesystem.
 *
 * If the caller passed in a nonzero start inode number, load the record from
 * the inobt and make the record look like all the inodes before agino are
 * free so that we skip them, and then move the cursor to the next inobt
 * record.  This is how we support starting an iwalk in the middle of an
 * inode chunk.
 *
 * If the caller passed in a start number of zero, move the cursor to the first
 * inobt record.
 *
 * The caller is responsible for cleaning up the cursor and buffer pointer
 * regardless of the error status.
 */
STATIC int
xfs_iwalk_ag_start(
    struct xfs_iwalk_ag *iwag,
    xfs_agino_t     agino,
    struct xfs_btree_cur    **curpp,
    struct xfs_buf      **agi_bpp,
    int         *has_more)
{
    struct xfs_mount    *mp = iwag->mp;
    struct xfs_trans    *tp = iwag->tp;
    struct xfs_perag    *pag = iwag->pag;
    struct xfs_inobt_rec_incore *irec;
    int         error;

    /* Set up a fresh cursor and empty the inobt cache. */
    iwag->nr_recs = 0;
    error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
    if (error)
        return error;

    /* Starting at the beginning of the AG?  That's easy! */
    if (agino == 0)
        return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);

    /*
     * Otherwise, we have to grab the inobt record where we left off, stuff
     * the record into our cache, and then see if there are more records.
     * We require a lookup cache of at least two elements so that the
     * caller doesn't have to deal with tearing down the cursor to walk the
     * records.
     */
    error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
    if (error)
        return error;

    /*
     * If the LE lookup at @agino yields no records, jump ahead to the
     * inobt cursor increment to see if there are more records to process.
     */
    if (!*has_more)
        goto out_advance;

    /* Get the record, should always work */
    irec = &iwag->recs[iwag->nr_recs];
    error = xfs_inobt_get_rec(*curpp, irec, has_more);
    if (error)
        return error;
    if (XFS_IS_CORRUPT(mp, *has_more != 1))
        return -EFSCORRUPTED;

    iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
                irec->ir_startino + XFS_INODES_PER_CHUNK - 1);

    /*
     * If the LE lookup yielded an inobt record before the cursor position,
     * skip it and see if there's another one after it.
     */
    if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
        goto out_advance;

    /*
     * If agino fell in the middle of the inode record, make it look like
     * the inodes up to agino are free so that we don't return them again.
     */
    if (iwag->trim_start)
        xfs_iwalk_adjust_start(agino, irec);

    /*
     * The prefetch calculation is supposed to give us a large enough inobt
     * record cache that this function can stage a partial first record and
     * the loop body can cache a record without having to check for cache
     * space until after it reads an inobt record.
     */
    iwag->nr_recs++;
    ASSERT(iwag->nr_recs < iwag->sz_recs);

out_advance:
    return xfs_btree_increment(*curpp, 0, has_more);
}
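
/*
 * Restart example (illustrative numbers): suppose the walk resumes at agino
 * 70 and the AG has inode chunks at aginos 64 and 128.  The LE lookup above
 * lands on the chunk at 64, xfs_iwalk_adjust_start() marks inodes 64-69 free
 * so they are not reported again, the trimmed record is cached, and the
 * final xfs_btree_increment() leaves the cursor at the chunk at 128.
 */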

/*
 * The inobt record cache is full, so preserve the inobt cursor state and
 * run callbacks on the cached inobt records.  When we're done, restore the
 * cursor state to wherever the cursor would have been had the cache not been
 * full (and therefore we could've just incremented the cursor) if *@has_more
 * is true.  On exit, *@has_more will indicate whether or not the caller should
 * try for more inode records.
 */
STATIC int
xfs_iwalk_run_callbacks(
    struct xfs_iwalk_ag     *iwag,
    struct xfs_btree_cur        **curpp,
    struct xfs_buf          **agi_bpp,
    int             *has_more)
{
    struct xfs_mount        *mp = iwag->mp;
    struct xfs_inobt_rec_incore *irec;
    xfs_agino_t         next_agino;
    int             error;

    next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;

    ASSERT(iwag->nr_recs > 0);

    /* Delete cursor but remember the last record we cached... */
    xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
    irec = &iwag->recs[iwag->nr_recs - 1];
    ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);

    if (iwag->drop_trans) {
        xfs_trans_cancel(iwag->tp);
        iwag->tp = NULL;
    }

    error = xfs_iwalk_ag_recs(iwag);
    if (error)
        return error;

    /* ...empty the cache... */
    iwag->nr_recs = 0;

    if (!has_more)
        return 0;

    if (iwag->drop_trans) {
        error = xfs_trans_alloc_empty(mp, &iwag->tp);
        if (error)
            return error;
    }

    /* ...and recreate the cursor just past where we left off. */
    error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
            agi_bpp);
    if (error)
        return error;

    return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}

/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
STATIC int
xfs_iwalk_ag(
    struct xfs_iwalk_ag     *iwag)
{
    struct xfs_mount        *mp = iwag->mp;
    struct xfs_perag        *pag = iwag->pag;
    struct xfs_buf          *agi_bp = NULL;
    struct xfs_btree_cur        *cur = NULL;
    xfs_agino_t         agino;
    int             has_more;
    int             error = 0;

    /* Set up our cursor at the right place in the inode btree. */
    ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
    agino = XFS_INO_TO_AGINO(mp, iwag->startino);
    error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);

    while (!error && has_more) {
        struct xfs_inobt_rec_incore *irec;
        xfs_ino_t           rec_fsino;

        cond_resched();
        if (xfs_pwork_want_abort(&iwag->pwork))
            goto out;

        /* Fetch the inobt record. */
        irec = &iwag->recs[iwag->nr_recs];
        error = xfs_inobt_get_rec(cur, irec, &has_more);
        if (error || !has_more)
            break;

        /* Make sure that we always move forward. */
        rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
        if (iwag->lastino != NULLFSINO &&
            XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
            error = -EFSCORRUPTED;
            goto out;
        }
        iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;

        /* No allocated inodes in this chunk; skip it. */
        if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
            error = xfs_btree_increment(cur, 0, &has_more);
            if (error)
                break;
            continue;
        }

        /*
         * Start readahead for this inode chunk in anticipation of
         * walking the inodes.
         */
        if (iwag->iwalk_fn)
            xfs_iwalk_ichunk_ra(mp, pag, irec);

        /*
         * If there's space in the buffer for more records, increment
         * the btree cursor and grab more.
         */
        if (++iwag->nr_recs < iwag->sz_recs) {
            error = xfs_btree_increment(cur, 0, &has_more);
            if (error || !has_more)
                break;
            continue;
        }

        /*
         * Otherwise, we need to save cursor state and run the callback
         * function on the cached records.  The run_callbacks function
         * is supposed to return a cursor pointing to the record where
         * we would be if we had been able to increment like above.
         */
        ASSERT(has_more);
        error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
    }

    if (iwag->nr_recs == 0 || error)
        goto out;

    /* Walk the unprocessed records in the cache. */
    error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);

out:
    xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
    return error;
}

/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.
 */
#define IWALK_MAX_INODE_PREFETCH    (2048U)

/*
 * Given the number of inodes to prefetch, set the number of inobt records that
 * we cache in memory, which controls the number of inodes we try to read
 * ahead.  Set the maximum if @inodes == 0.
 */
static inline unsigned int
xfs_iwalk_prefetch(
    unsigned int        inodes)
{
    unsigned int        inobt_records;

    /*
     * If the caller didn't tell us the number of inodes they wanted,
     * assume the maximum prefetch possible for best performance.
     * Otherwise, cap prefetch at that maximum so that we don't start an
     * absurd amount of prefetch.
     */
    if (inodes == 0)
        inodes = IWALK_MAX_INODE_PREFETCH;
    inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);

    /* Round the inode count up to a full chunk. */
    inodes = round_up(inodes, XFS_INODES_PER_CHUNK);

    /*
     * In order to convert the number of inodes to prefetch into an
     * estimate of the number of inobt records to cache, we require a
     * conversion factor that reflects our expectations of the average
     * loading factor of an inode chunk.  Based on data gathered, most
     * (but not all) filesystems manage to keep the inode chunks totally
     * full, so we'll underestimate slightly so that our readahead will
     * still deliver the performance we want on aging filesystems:
     *
     * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
     *
     * The funny math is to avoid integer division.
     */
    inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);

    /*
     * Allocate enough space to prefetch at least two inobt records so that
     * we can cache both the record where the iwalk started and the next
     * record.  This simplifies the AG inode walk loop setup code.
     */
    return max(inobt_records, 2U);
}
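
/*
 * Worked example (illustrative): a caller asking for 1000 inodes is capped
 * at 2048, rounded up to 1024 (16 chunks of 64), and (1024 * 5) / (4 * 64)
 * = 20 inobt records get cached.  With @inodes == 0, the 2048-inode maximum
 * applies and (2048 * 5) / (4 * 64) = 40 records are cached.
 */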

/*
 * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 * will be called for each allocated inode, being passed the inode's number and
 * @data.  @inode_records controls how many inobt records' worth of inodes we
 * try to readahead.
 */
int
xfs_iwalk(
    struct xfs_mount    *mp,
    struct xfs_trans    *tp,
    xfs_ino_t       startino,
    unsigned int        flags,
    xfs_iwalk_fn        iwalk_fn,
    unsigned int        inode_records,
    void            *data)
{
    struct xfs_iwalk_ag iwag = {
        .mp     = mp,
        .tp     = tp,
        .iwalk_fn   = iwalk_fn,
        .data       = data,
        .startino   = startino,
        .sz_recs    = xfs_iwalk_prefetch(inode_records),
        .trim_start = 1,
        .skip_empty = 1,
        .pwork      = XFS_PWORK_SINGLE_THREADED,
        .lastino    = NULLFSINO,
    };
    struct xfs_perag    *pag;
    xfs_agnumber_t      agno = XFS_INO_TO_AGNO(mp, startino);
    int         error;

    ASSERT(agno < mp->m_sb.sb_agcount);
    ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

    error = xfs_iwalk_alloc(&iwag);
    if (error)
        return error;

    for_each_perag_from(mp, agno, pag) {
        iwag.pag = pag;
        error = xfs_iwalk_ag(&iwag);
        if (error)
            break;
        iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
        if (flags & XFS_IWALK_SAME_AG)
            break;
        iwag.pag = NULL;
    }

    if (iwag.pag)
        xfs_perag_put(pag);
    xfs_iwalk_free(&iwag);
    return error;
}
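
/*
 * Usage sketch (illustrative; count_fn and nr are made-up names): count
 * every allocated inode in the filesystem, using the default prefetch and
 * no transaction.
 *
 *    static int
 *    count_fn(
 *        struct xfs_mount    *mp,
 *        struct xfs_trans    *tp,
 *        xfs_ino_t       ino,
 *        void            *data)
 *    {
 *        uint64_t    *nr = data;
 *
 *        (*nr)++;
 *        return 0;
 *    }
 *
 *    uint64_t    nr = 0;
 *    int     error = xfs_iwalk(mp, NULL, 0, 0, count_fn, 0, &nr);
 */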

/* Run per-thread iwalk work. */
static int
xfs_iwalk_ag_work(
    struct xfs_mount    *mp,
    struct xfs_pwork    *pwork)
{
    struct xfs_iwalk_ag *iwag;
    int         error = 0;

    iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
    if (xfs_pwork_want_abort(pwork))
        goto out;

    error = xfs_iwalk_alloc(iwag);
    if (error)
        goto out;
    /*
     * Grab an empty transaction so that we can use its recursive buffer
     * locking abilities to detect cycles in the inobt without deadlocking.
     */
    error = xfs_trans_alloc_empty(mp, &iwag->tp);
    if (error)
        goto out;
    iwag->drop_trans = 1;

    error = xfs_iwalk_ag(iwag);
    if (iwag->tp)
        xfs_trans_cancel(iwag->tp);
    xfs_iwalk_free(iwag);
out:
    xfs_perag_put(iwag->pag);
    kmem_free(iwag);
    return error;
}

/*
 * Walk all the inodes in the filesystem using multiple threads to process each
 * AG.
 */
int
xfs_iwalk_threaded(
    struct xfs_mount    *mp,
    xfs_ino_t       startino,
    unsigned int        flags,
    xfs_iwalk_fn        iwalk_fn,
    unsigned int        inode_records,
    bool            polled,
    void            *data)
{
    struct xfs_pwork_ctl    pctl;
    struct xfs_perag    *pag;
    xfs_agnumber_t      agno = XFS_INO_TO_AGNO(mp, startino);
    int         error;

    ASSERT(agno < mp->m_sb.sb_agcount);
    ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

    error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
    if (error)
        return error;

    for_each_perag_from(mp, agno, pag) {
        struct xfs_iwalk_ag *iwag;

        if (xfs_pwork_ctl_want_abort(&pctl))
            break;

        iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
        iwag->mp = mp;

        /*
         * perag is being handed off to async work, so take another
         * reference for the async work to release.
         */
        atomic_inc(&pag->pag_ref);
        iwag->pag = pag;
        iwag->iwalk_fn = iwalk_fn;
        iwag->data = data;
        iwag->startino = startino;
        iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
        iwag->lastino = NULLFSINO;
        xfs_pwork_queue(&pctl, &iwag->pwork);
        startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
        if (flags & XFS_IWALK_SAME_AG)
            break;
    }
    if (pag)
        xfs_perag_put(pag);
    if (polled)
        xfs_pwork_poll(&pctl);
    return xfs_pwork_destroy(&pctl);
}
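
/*
 * A note on @polled (based on the xfs_pwork interface, not on anything in
 * this file, so treat it as an editorial assumption): callers that hold
 * locks while waiting, such as mount-time quotacheck, pass polled == true
 * so that xfs_pwork_poll() waits for the queued AG walks by polling and
 * touching the soft lockup watchdog; other callers pass false and rely on
 * xfs_pwork_destroy() to flush the remaining work.
 */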

/*
 * Allow callers to cache up to a page's worth of inobt records.  This reflects
 * the existing inumbers prefetching behavior.  Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.
 */
#define MAX_INOBT_WALK_PREFETCH \
    (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))

/*
 * Given the number of records that the user wanted, set the number of inobt
 * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 */
static inline unsigned int
xfs_inobt_walk_prefetch(
    unsigned int        inobt_records)
{
    /*
     * If the caller didn't tell us the number of inobt records they
     * wanted, assume the maximum prefetch possible for best performance.
     */
    if (inobt_records == 0)
        inobt_records = MAX_INOBT_WALK_PREFETCH;

    /*
     * Allocate enough space to prefetch at least two inobt records so that
     * we can cache both the record where the iwalk started and the next
     * record.  This simplifies the AG inode walk loop setup code.
     */
    inobt_records = max(inobt_records, 2U);

    /*
     * Cap prefetch at that maximum so that we don't use an absurd amount
     * of memory.
     */
    return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
}
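
/*
 * Illustrative sizing (actual numbers depend on the architecture and struct
 * layout): with 4096-byte pages and a 16-byte struct xfs_inobt_rec_incore,
 * the cap works out to 256 cached records, i.e. 256 * 64 = 16384 inodes'
 * worth of inobt data per batch.
 */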

/*
 * Walk all inode btree records in the filesystem starting from @startino.  The
 * @inobt_walk_fn will be called for each btree record, being passed the incore
 * record and @data.  @inobt_records controls how many inobt records we try to
 * cache ahead of time.
 */
int
xfs_inobt_walk(
    struct xfs_mount    *mp,
    struct xfs_trans    *tp,
    xfs_ino_t       startino,
    unsigned int        flags,
    xfs_inobt_walk_fn   inobt_walk_fn,
    unsigned int        inobt_records,
    void            *data)
{
    struct xfs_iwalk_ag iwag = {
        .mp     = mp,
        .tp     = tp,
        .inobt_walk_fn  = inobt_walk_fn,
        .data       = data,
        .startino   = startino,
        .sz_recs    = xfs_inobt_walk_prefetch(inobt_records),
        .pwork      = XFS_PWORK_SINGLE_THREADED,
        .lastino    = NULLFSINO,
    };
    struct xfs_perag    *pag;
    xfs_agnumber_t      agno = XFS_INO_TO_AGNO(mp, startino);
    int         error;

    ASSERT(agno < mp->m_sb.sb_agcount);
    ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));

    error = xfs_iwalk_alloc(&iwag);
    if (error)
        return error;

    for_each_perag_from(mp, agno, pag) {
        iwag.pag = pag;
        error = xfs_iwalk_ag(&iwag);
        if (error)
            break;
        iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
        if (flags & XFS_INOBT_WALK_SAME_AG)
            break;
        iwag.pag = NULL;
    }

    if (iwag.pag)
        xfs_perag_put(pag);
    xfs_iwalk_free(&iwag);
    return error;
}
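
/*
 * Usage sketch (illustrative; chunks_fn and nr_chunks are made-up names):
 * count the inode chunks in the filesystem, one callback per inobt record.
 * The callback signature matches the inobt_walk_fn call site in
 * xfs_iwalk_ag_recs() above.
 *
 *    static int
 *    chunks_fn(
 *        struct xfs_mount        *mp,
 *        struct xfs_trans        *tp,
 *        xfs_agnumber_t          agno,
 *        const struct xfs_inobt_rec_incore *irec,
 *        void                *data)
 *    {
 *        uint64_t    *nr_chunks = data;
 *
 *        (*nr_chunks)++;
 *        return 0;
 *    }
 *
 *    uint64_t    nr_chunks = 0;
 *    int     error = xfs_inobt_walk(mp, NULL, 0, 0, chunks_fn, 0,
 *                       &nr_chunks);
 */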