0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
0004  * All Rights Reserved.
0005  */
0006 #include "xfs.h"
0007 #include "xfs_fs.h"
0008 #include "xfs_shared.h"
0009 #include "xfs_format.h"
0010 #include "xfs_log_format.h"
0011 #include "xfs_trans_resv.h"
0012 #include "xfs_sb.h"
0013 #include "xfs_mount.h"
0014 #include "xfs_trans.h"
0015 #include "xfs_error.h"
0016 #include "xfs_alloc.h"
0017 #include "xfs_fsops.h"
0018 #include "xfs_trans_space.h"
0019 #include "xfs_log.h"
0020 #include "xfs_log_priv.h"
0021 #include "xfs_ag.h"
0022 #include "xfs_ag_resv.h"
0023 #include "xfs_trace.h"
0024 
0025 /*
0026  * Write new AG headers to disk. These are non-transactional, but they need
0027  * to be written and completed prior to the growfs transaction being logged.
0028  * To do this, we use a delayed write buffer list and wait for
0029  * submission and IO completion of the list as a whole. This allows the
0030  * IO subsystem to merge all the AG headers in a single AG into a single
0031  * IO and hide most of the latency of the IO from us.
0032  *
0033  * This also means that if we get an error whilst building the buffer
0034  * list to write, we can cancel the entire list without having written
0035  * anything.
0036  */
0037 static int
0038 xfs_resizefs_init_new_ags(
0039     struct xfs_trans    *tp,
0040     struct aghdr_init_data  *id,
0041     xfs_agnumber_t      oagcount,
0042     xfs_agnumber_t      nagcount,
0043     xfs_rfsblock_t      delta,
0044     struct xfs_perag    *last_pag,
0045     bool            *lastag_extended)
0046 {
0047     struct xfs_mount    *mp = tp->t_mountp;
0048     xfs_rfsblock_t      nb = mp->m_sb.sb_dblocks + delta;
0049     int         error;
0050 
0051     *lastag_extended = false;
0052 
0053     INIT_LIST_HEAD(&id->buffer_list);
0054     for (id->agno = nagcount - 1;
0055          id->agno >= oagcount;
0056          id->agno--, delta -= id->agsize) {
0057 
0058         if (id->agno == nagcount - 1)
0059             id->agsize = nb - (id->agno *
0060                     (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
0061         else
0062             id->agsize = mp->m_sb.sb_agblocks;
0063 
0064         error = xfs_ag_init_headers(mp, id);
0065         if (error) {
0066             xfs_buf_delwri_cancel(&id->buffer_list);
0067             return error;
0068         }
0069     }
0070 
0071     error = xfs_buf_delwri_submit(&id->buffer_list);
0072     if (error)
0073         return error;
0074 
0075     if (delta) {
0076         *lastag_extended = true;
0077         error = xfs_ag_extend_space(last_pag, tp, delta);
0078     }
0079     return error;
0080 }
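
The loop above walks the brand-new AGs from the highest-numbered one down:
every new AG is exactly sb_agblocks long except the very last one, which takes
whatever remains of the new size, and whatever is left of delta after the loop
is the amount by which the old last AG is extended via xfs_ag_extend_space().
A minimal, self-contained sketch of that arithmetic in userspace C follows; the
geometry numbers are made up for illustration (they assume the old filesystem
ended on a short last AG) and this is not kernel code.

/*
 * Sketch only: userspace arithmetic mirroring the AG sizing loop in
 * xfs_resizefs_init_new_ags().  All geometry values are invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t agblocks = 16384;	/* sb_agblocks: blocks in a full AG */
	uint64_t odblocks = 60000;	/* old sb_dblocks (last AG is short) */
	uint64_t nb = 100000;		/* new sb_dblocks after the grow */
	uint32_t oagcount = 4;		/* old AG count */
	uint32_t nagcount = (nb + agblocks - 1) / agblocks;	/* 7 */
	uint64_t delta = nb - odblocks;	/* blocks being added */

	/* Highest-numbered AG first; only that one may be short. */
	for (uint32_t agno = nagcount - 1; agno >= oagcount; agno--) {
		uint64_t agsize = (agno == nagcount - 1) ?
				nb - (uint64_t)agno * agblocks : agblocks;
		printf("new AG %u: %llu blocks\n",
		       (unsigned)agno, (unsigned long long)agsize);
		delta -= agsize;
	}
	/* Whatever is left over extends the old last AG. */
	printf("old last AG %u grows by %llu blocks\n",
	       (unsigned)(oagcount - 1), (unsigned long long)delta);
	return 0;
}

With these numbers the sketch prints a 1696-block runt AG 6, two full-size AGs,
and a 5536-block extension of the old AG 3.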
0081 
0082 /*
0083  * growfs operations
0084  */
0085 static int
0086 xfs_growfs_data_private(
0087     struct xfs_mount    *mp,        /* mount point for filesystem */
0088     struct xfs_growfs_data  *in)        /* growfs data input struct */
0089 {
0090     struct xfs_buf      *bp;
0091     int         error;
0092     xfs_agnumber_t      nagcount;
0093     xfs_agnumber_t      nagimax = 0;
0094     xfs_rfsblock_t      nb, nb_div, nb_mod;
0095     int64_t         delta;
0096     bool            lastag_extended;
0097     xfs_agnumber_t      oagcount;
0098     struct xfs_trans    *tp;
0099     struct aghdr_init_data  id = {};
0100     struct xfs_perag    *last_pag;
0101 
0102     nb = in->newblocks;
0103     error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
0104     if (error)
0105         return error;
0106 
0107     if (nb > mp->m_sb.sb_dblocks) {
0108         error = xfs_buf_read_uncached(mp->m_ddev_targp,
0109                 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
0110                 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
0111         if (error)
0112             return error;
0113         xfs_buf_relse(bp);
0114     }
0115 
0116     nb_div = nb;
0117     nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
0118     nagcount = nb_div + (nb_mod != 0);
0119     if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
0120         nagcount--;
0121         nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
0122     }
0123     delta = nb - mp->m_sb.sb_dblocks;
0124     /*
0125      * Reject filesystems with a single AG because they are not
0126      * supported, and reject a shrink operation that would cause a
0127      * filesystem to become unsupported.
0128      */
0129     if (delta < 0 && nagcount < 2)
0130         return -EINVAL;
0131 
0132     oagcount = mp->m_sb.sb_agcount;
0133     /* allocate the new per-ag structures */
0134     if (nagcount > oagcount) {
0135         error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
0136         if (error)
0137             return error;
0138     } else if (nagcount < oagcount) {
0139         /* TODO: shrinking the filesystem by entire AGs is not yet supported */
0140         return -EINVAL;
0141     }
0142 
0143     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
0144             (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
0145             XFS_TRANS_RESERVE, &tp);
0146     if (error)
0147         return error;
0148 
0149     last_pag = xfs_perag_get(mp, oagcount - 1);
0150     if (delta > 0) {
0151         error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
0152                 delta, last_pag, &lastag_extended);
0153     } else {
0154         xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
0155     "EXPERIMENTAL online shrink feature in use. Use at your own risk!");
0156 
0157         error = xfs_ag_shrink_space(last_pag, &tp, -delta);
0158     }
0159     xfs_perag_put(last_pag);
0160     if (error)
0161         goto out_trans_cancel;
0162 
0163     /*
0164      * Update changed superblock fields transactionally. These are not
0165      * seen by the rest of the world until the transaction commit applies
0166      * them atomically to the superblock.
0167      */
0168     if (nagcount > oagcount)
0169         xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
0170     if (delta)
0171         xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
0172     if (id.nfree)
0173         xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
0174 
0175     /*
0176      * Sync sb counters now to reflect the updated values. This is
0177      * particularly important for shrink because the write verifier
0178      * will fail if sb_fdblocks is ever larger than sb_dblocks.
0179      */
0180     if (xfs_has_lazysbcount(mp))
0181         xfs_log_sb(tp);
0182 
0183     xfs_trans_set_sync(tp);
0184     error = xfs_trans_commit(tp);
0185     if (error)
0186         return error;
0187 
0188     /* New allocation groups fully initialized, so update mount struct */
0189     if (nagimax)
0190         mp->m_maxagi = nagimax;
0191     xfs_set_low_space_thresholds(mp);
0192     mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
0193 
0194     if (delta > 0) {
0195         /*
0196          * If we expanded the last AG, free the per-AG reservation
0197          * so we can reinitialize it with the new size.
0198          */
0199         if (lastag_extended) {
0200             struct xfs_perag    *pag;
0201 
0202             pag = xfs_perag_get(mp, id.agno);
0203             error = xfs_ag_resv_free(pag);
0204             xfs_perag_put(pag);
0205             if (error)
0206                 return error;
0207         }
0208         /*
0209          * Reserve AG metadata blocks. ENOSPC here does not mean there
0210          * was a growfs failure, just that there still isn't space for
0211          * new user data after the grow has been run.
0212          */
0213         error = xfs_fs_reserve_ag_blocks(mp);
0214         if (error == -ENOSPC)
0215             error = 0;
0216     }
0217     return error;
0218 
0219 out_trans_cancel:
0220     xfs_trans_cancel(tp);
0221     return error;
0222 }
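
Before any AG headers are written, the requested size is converted into an AG
count: nb is divided by sb_agblocks with do_div(), and if the remainder would
leave a trailing AG smaller than XFS_MIN_AG_BLOCKS the request is quietly
rounded down to a whole number of AGs. A minimal sketch of that rounding
follows; plain C division stands in for do_div(), and the 64-block minimum is
hard-coded here for illustration rather than taken from the kernel headers.

/*
 * Sketch only: how a requested block count becomes an AG count in
 * xfs_growfs_data_private().  Not kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define MIN_AG_BLOCKS	64	/* stands in for XFS_MIN_AG_BLOCKS */

int main(void)
{
	uint64_t agblocks = 16384;	/* sb_agblocks */
	uint64_t nb = 98340;		/* requested new size in blocks */

	uint64_t nb_div = nb / agblocks;	/* whole AGs */
	uint64_t nb_mod = nb % agblocks;	/* size of the tail AG */
	uint32_t nagcount = nb_div + (nb_mod != 0);

	/* A runt trailing AG is not allowed: round the request down. */
	if (nb_mod && nb_mod < MIN_AG_BLOCKS) {
		nagcount--;
		nb = (uint64_t)nagcount * agblocks;
	}
	printf("grow to %llu blocks in %u AGs\n",
	       (unsigned long long)nb, (unsigned)nagcount);
	return 0;
}

Here the 36-block tail is below the minimum, so the filesystem is grown to
98304 blocks in 6 AGs rather than to the requested 98340.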
0223 
0224 static int
0225 xfs_growfs_log_private(
0226     struct xfs_mount    *mp,    /* mount point for filesystem */
0227     struct xfs_growfs_log   *in)    /* growfs log input struct */
0228 {
0229     xfs_extlen_t        nb;
0230 
0231     nb = in->newblocks;
0232     if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
0233         return -EINVAL;
0234     if (nb == mp->m_sb.sb_logblocks &&
0235         in->isint == (mp->m_sb.sb_logstart != 0))
0236         return -EINVAL;
0237     /*
0238      * Moving the log is hard: it needs new interfaces to sync the log
0239      * first and to hold off all activity while moving it. It could give
0240      * us a shorter or longer log in the same space, or transform an
0241      * internal log into an external log or vice versa.
0242      */
0243     return -ENOSYS;
0244 }
0245 
0246 static int
0247 xfs_growfs_imaxpct(
0248     struct xfs_mount    *mp,
0249     __u32           imaxpct)
0250 {
0251     struct xfs_trans    *tp;
0252     int         dpct;
0253     int         error;
0254 
0255     if (imaxpct > 100)
0256         return -EINVAL;
0257 
0258     error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
0259             XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
0260     if (error)
0261         return error;
0262 
0263     dpct = imaxpct - mp->m_sb.sb_imax_pct;
0264     xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
0265     xfs_trans_set_sync(tp);
0266     return xfs_trans_commit(tp);
0267 }
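
Note that the change is logged as a signed delta (dpct may well be negative)
because xfs_trans_mod_sb() applies differences, not absolute values. The
percentage itself bounds how many filesystem blocks may ever hold inodes;
xfs_growfs_data() below recomputes M_IGEO(mp)->maxicount from it. A rough
sketch of that arithmetic follows, where a fixed inodes-per-block factor is an
illustrative assumption standing in for XFS_FSB_TO_INO().

/*
 * Sketch only: the sb_imax_pct arithmetic.  The geometry is invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dblocks = 26214400;	/* sb_dblocks: 100 GiB of 4k blocks */
	unsigned imaxpct = 25;		/* sb_imax_pct */
	unsigned inodes_per_block = 8;	/* 4096-byte blocks, 512-byte inodes */

	/* Same shape as the maxicount update in xfs_growfs_data(). */
	uint64_t icount = dblocks * imaxpct / 100;
	uint64_t maxicount = icount * inodes_per_block;

	printf("at most %llu blocks (%llu inodes) may hold inodes\n",
	       (unsigned long long)icount, (unsigned long long)maxicount);
	return 0;
}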
0268 
0269 /*
0270  * The protected versions of the growfs functions acquire and release locks on
0271  * the mount point - they are exported through the ioctls XFS_IOC_FSGROWFSDATA,
0272  * XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
0273  */
0274 int
0275 xfs_growfs_data(
0276     struct xfs_mount    *mp,
0277     struct xfs_growfs_data  *in)
0278 {
0279     int         error = 0;
0280 
0281     if (!capable(CAP_SYS_ADMIN))
0282         return -EPERM;
0283     if (!mutex_trylock(&mp->m_growlock))
0284         return -EWOULDBLOCK;
0285 
0286     /* update imaxpct separately from the physical grow of the filesystem */
0287     if (in->imaxpct != mp->m_sb.sb_imax_pct) {
0288         error = xfs_growfs_imaxpct(mp, in->imaxpct);
0289         if (error)
0290             goto out_error;
0291     }
0292 
0293     if (in->newblocks != mp->m_sb.sb_dblocks) {
0294         error = xfs_growfs_data_private(mp, in);
0295         if (error)
0296             goto out_error;
0297     }
0298 
0299     /* Post growfs calculations needed to reflect new state in operations */
0300     if (mp->m_sb.sb_imax_pct) {
0301         uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
0302         do_div(icount, 100);
0303         M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
0304     } else
0305         M_IGEO(mp)->maxicount = 0;
0306 
0307     /* Update secondary superblocks now the physical grow has completed */
0308     error = xfs_update_secondary_sbs(mp);
0309 
0310 out_error:
0311     /*
0312      * Increment the generation unconditionally; the error could be from
0313      * updating the secondary superblocks, in which case the new size
0314      * is already live.
0315      */
0316     mp->m_generation++;
0317     mutex_unlock(&mp->m_growlock);
0318     return error;
0319 }
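
For reference, this entry point is reached from userspace through the
XFS_IOC_FSGROWFSDATA ioctl named in the comment above; xfs_growfs(8) is the
normal front end. The sketch below assumes the xfsprogs development headers
(<xfs/xfs.h>) declare struct xfs_growfs_data and the ioctl number, which is
not guaranteed on every system.

/*
 * Sketch only: a minimal XFS_IOC_FSGROWFSDATA caller.  Requires
 * CAP_SYS_ADMIN; EWOULDBLOCK means another grow already holds
 * m_growlock, since the kernel only trylocks it.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

int main(int argc, char **argv)
{
	if (argc != 4) {
		fprintf(stderr, "usage: %s <mountpoint> <newblocks> <imaxpct>\n",
			argv[0]);
		return 1;
	}

	/* Any open file descriptor on the filesystem will do. */
	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * xfs_growfs(8) reads the current geometry first so an unchanged
	 * imaxpct is passed straight back; here it is simply taken from
	 * the command line.
	 */
	struct xfs_growfs_data in = {
		.newblocks = strtoull(argv[2], NULL, 0),
		.imaxpct = (unsigned int)strtoul(argv[3], NULL, 0),
	};

	if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &in) < 0)
		perror("XFS_IOC_FSGROWFSDATA");

	close(fd);
	return 0;
}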
0320 
0321 int
0322 xfs_growfs_log(
0323     xfs_mount_t     *mp,
0324     struct xfs_growfs_log   *in)
0325 {
0326     int error;
0327 
0328     if (!capable(CAP_SYS_ADMIN))
0329         return -EPERM;
0330     if (!mutex_trylock(&mp->m_growlock))
0331         return -EWOULDBLOCK;
0332     error = xfs_growfs_log_private(mp, in);
0333     mutex_unlock(&mp->m_growlock);
0334     return error;
0335 }
0336 
0337 /*
0338  * exported through ioctl XFS_IOC_FSCOUNTS
0339  */
0340 
0341 void
0342 xfs_fs_counts(
0343     xfs_mount_t     *mp,
0344     xfs_fsop_counts_t   *cnt)
0345 {
0346     cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
0347     cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
0348     cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
0349                         xfs_fdblocks_unavailable(mp);
0350     cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
0351 }
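
A userspace caller of the XFS_IOC_FSCOUNTS ioctl mentioned above gets these
four counters back in an xfs_fsop_counts_t. A hedged sketch follows, again
assuming the xfsprogs headers (<xfs/xfs.h>) provide the structure and ioctl
definition.

/*
 * Sketch only: sample the summary counters of the filesystem backing
 * the given path (defaults to the current directory).
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	xfs_fsop_counts_t cnt;
	if (ioctl(fd, XFS_IOC_FSCOUNTS, &cnt) < 0) {
		perror("XFS_IOC_FSCOUNTS");
		close(fd);
		return 1;
	}

	/* Point-in-time reads of per-cpu counters: approximate on a busy fs. */
	printf("free data blocks: %llu\n", (unsigned long long)cnt.freedata);
	printf("free rt extents:  %llu\n", (unsigned long long)cnt.freertx);
	printf("allocated inodes: %llu\n", (unsigned long long)cnt.allocino);
	printf("free inodes:      %llu\n", (unsigned long long)cnt.freeino);

	close(fd);
	return 0;
}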
0352 
0353 /*
0354  * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
0355  *
0356  * xfs_reserve_blocks is called to set m_resblks
0357  * in the in-core mount table. The number of unused reserved blocks
0358  * is kept in m_resblks_avail.
0359  *
0360  * Reserve the requested number of blocks if available. Otherwise return
0361  * as many as possible to satisfy the request. The actual number
0362  * reserved is returned in outval.
0363  *
0364  * A null inval pointer indicates that only the current number of reserved
0365  * blocks available should be returned; no settings are changed.
0366  */
0367 
0368 int
0369 xfs_reserve_blocks(
0370     xfs_mount_t             *mp,
0371     uint64_t              *inval,
0372     xfs_fsop_resblks_t      *outval)
0373 {
0374     int64_t         lcounter, delta;
0375     int64_t         fdblks_delta = 0;
0376     uint64_t        request;
0377     int64_t         free;
0378     int         error = 0;
0379 
0380     /* If inval is null, report current values and return */
0381     if (inval == (uint64_t *)NULL) {
0382         if (!outval)
0383             return -EINVAL;
0384         outval->resblks = mp->m_resblks;
0385         outval->resblks_avail = mp->m_resblks_avail;
0386         return 0;
0387     }
0388 
0389     request = *inval;
0390 
0391     /*
0392      * With per-cpu counters, this becomes an interesting problem. We need
0393      * to work out if we are freeing or allocating blocks first, then we can
0394      * do the modification as necessary.
0395      *
0396      * We do this under the m_sb_lock so that if we are near ENOSPC, we will
0397      * hold out any changes while we work out what to do. This means that
0398      * the amount of free space can change while we do this, so we need to
0399      * retry if we end up trying to reserve more space than is available.
0400      */
0401     spin_lock(&mp->m_sb_lock);
0402 
0403     /*
0404      * If our previous reservation was larger than the current value,
0405      * then move any unused blocks back to the free pool. Modify the resblks
0406      * counters directly since we shouldn't have any problems unreserving
0407      * space.
0408      */
0409     if (mp->m_resblks > request) {
0410         lcounter = mp->m_resblks_avail - request;
0411         if (lcounter  > 0) {        /* release unused blocks */
0412             fdblks_delta = lcounter;
0413             mp->m_resblks_avail -= lcounter;
0414         }
0415         mp->m_resblks = request;
0416         if (fdblks_delta) {
0417             spin_unlock(&mp->m_sb_lock);
0418             error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
0419             spin_lock(&mp->m_sb_lock);
0420         }
0421 
0422         goto out;
0423     }
0424 
0425     /*
0426      * If the request is larger than the current reservation, reserve the
0427      * blocks before we update the reserve counters. Sample m_fdblocks and
0428      * perform a partial reservation if the request exceeds free space.
0429      *
0430      * The code below estimates how many blocks it can request from
0431      * fdblocks to stash in the reserve pool.  This is a classic TOCTOU
0432      * race since fdblocks updates are not always coordinated via
0433      * m_sb_lock.  Set the reserve size even if there's not enough free
0434      * space to fill it because mod_fdblocks will refill an undersized
0435      * reserve when it can.
0436      */
0437     free = percpu_counter_sum(&mp->m_fdblocks) -
0438                         xfs_fdblocks_unavailable(mp);
0439     delta = request - mp->m_resblks;
0440     mp->m_resblks = request;
0441     if (delta > 0 && free > 0) {
0442         /*
0443          * We'll either succeed in getting space from the free block
0444          * count or we'll get an ENOSPC.  Don't set the reserved flag
0445          * here - we don't want to reserve the extra reserve blocks
0446          * from the reserve.
0447          *
0448          * The desired reserve size can change after we drop the lock.
0449          * Use mod_fdblocks to put the space into the reserve or into
0450          * fdblocks as appropriate.
0451          */
0452         fdblks_delta = min(free, delta);
0453         spin_unlock(&mp->m_sb_lock);
0454         error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
0455         if (!error)
0456             xfs_mod_fdblocks(mp, fdblks_delta, 0);
0457         spin_lock(&mp->m_sb_lock);
0458     }
0459 out:
0460     if (outval) {
0461         outval->resblks = mp->m_resblks;
0462         outval->resblks_avail = mp->m_resblks_avail;
0463     }
0464 
0465     spin_unlock(&mp->m_sb_lock);
0466     return error;
0467 }
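
Stripped of the locking and per-cpu machinery, the two branches above reduce
to simple bookkeeping: shrinking the reservation hands any unused reserved
blocks back to fdblocks, while growing it moves min(free, delta) the other way
and may leave the pool only partially filled. A single-threaded toy model of
that arithmetic follows (deliberately ignoring the races the comments
describe).

/*
 * Sketch only: a toy model of the reserve pool adjustment.  Not kernel
 * code; there is no locking and no per-cpu batching.
 */
#include <stdio.h>
#include <stdint.h>

static int64_t fdblocks      = 1000;	/* free space pool */
static int64_t resblks       = 0;	/* reservation target (m_resblks) */
static int64_t resblks_avail = 0;	/* unused reservation (m_resblks_avail) */

static void reserve_blocks(int64_t request)
{
	if (resblks > request) {
		/* Shrinking: return unused reserved blocks to free space. */
		int64_t unused = resblks_avail - request;
		if (unused > 0) {
			resblks_avail -= unused;
			fdblocks += unused;
		}
		resblks = request;
	} else {
		/* Growing: take what we can, bounded by the free space. */
		int64_t delta = request - resblks;
		int64_t take = delta < fdblocks ? delta : fdblocks;
		resblks = request;
		if (take > 0) {
			fdblocks -= take;
			resblks_avail += take;
		}
	}
	printf("request=%lld -> resblks=%lld avail=%lld free=%lld\n",
	       (long long)request, (long long)resblks,
	       (long long)resblks_avail, (long long)fdblocks);
}

int main(void)
{
	reserve_blocks(200);	/* fully filled reservation */
	reserve_blocks(1500);	/* partial: only 800 free blocks remain */
	reserve_blocks(100);	/* shrink: the surplus returns to fdblocks */
	return 0;
}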
0468 
0469 int
0470 xfs_fs_goingdown(
0471     xfs_mount_t *mp,
0472     uint32_t    inflags)
0473 {
0474     switch (inflags) {
0475     case XFS_FSOP_GOING_FLAGS_DEFAULT: {
0476         if (!freeze_bdev(mp->m_super->s_bdev)) {
0477             xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
0478             thaw_bdev(mp->m_super->s_bdev);
0479         }
0480         break;
0481     }
0482     case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
0483         xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
0484         break;
0485     case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
0486         xfs_force_shutdown(mp,
0487                 SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
0488         break;
0489     default:
0490         return -EINVAL;
0491     }
0492 
0493     return 0;
0494 }
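
These flags are exercised from userspace for testing, typically via xfs_io's
shutdown command. The sketch below assumes the xfsprogs headers expose the
goingdown ioctl (commonly named XFS_IOC_GOINGDOWN) together with the
XFS_FSOP_GOING_FLAGS_* values handled above.

/*
 * Sketch only: force a test shutdown of the filesystem backing a path.
 * DEFAULT freezes the block device first, LOGFLUSH does a plain forced
 * shutdown, and NOLOGFLUSH additionally marks the log with an I/O error.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	uint32_t flags = XFS_FSOP_GOING_FLAGS_LOGFLUSH;
	if (ioctl(fd, XFS_IOC_GOINGDOWN, &flags) < 0)
		perror("XFS_IOC_GOINGDOWN");

	close(fd);
	return 0;
}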
0495 
0496 /*
0497  * Force a shutdown of the filesystem instantly while keeping the filesystem
0498  * consistent. We don't do an unmount here; just shutdown the shop, make sure
0499  * that absolutely nothing persistent happens to this filesystem after this
0500  * point.
0501  *
0502  * The shutdown state change is atomic, so only the first shutdown call
0503  * actually processes the shutdown. This means we only shut down the log once,
0504  * as required, and we don't spam the logs when multiple concurrent shutdowns
0505  * race to set the shutdown flags.
0506  */
0507 void
0508 xfs_do_force_shutdown(
0509     struct xfs_mount *mp,
0510     uint32_t    flags,
0511     char        *fname,
0512     int     lnnum)
0513 {
0514     int     tag;
0515     const char  *why;
0516 
0517 
0518     if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
0519         xlog_shutdown_wait(mp->m_log);
0520         return;
0521     }
0522     if (mp->m_sb_bp)
0523         mp->m_sb_bp->b_flags |= XBF_DONE;
0524 
0525     if (flags & SHUTDOWN_FORCE_UMOUNT)
0526         xfs_alert(mp, "User initiated shutdown received.");
0527 
0528     if (xlog_force_shutdown(mp->m_log, flags)) {
0529         tag = XFS_PTAG_SHUTDOWN_LOGERROR;
0530         why = "Log I/O Error";
0531     } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
0532         tag = XFS_PTAG_SHUTDOWN_CORRUPT;
0533         why = "Corruption of in-memory data";
0534     } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
0535         tag = XFS_PTAG_SHUTDOWN_CORRUPT;
0536         why = "Corruption of on-disk metadata";
0537     } else {
0538         tag = XFS_PTAG_SHUTDOWN_IOERROR;
0539         why = "Metadata I/O Error";
0540     }
0541 
0542     trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
0543 
0544     xfs_alert_tag(mp, tag,
0545 "%s (0x%x) detected at %pS (%s:%d).  Shutting down filesystem.",
0546             why, flags, __return_address, fname, lnnum);
0547     xfs_alert(mp,
0548         "Please unmount the filesystem and rectify the problem(s)");
0549     if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
0550         xfs_stack_trace();
0551 }
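
The once-only behaviour promised in the comment above comes from
test_and_set_bit() returning the previous value of XFS_OPSTATE_SHUTDOWN, so
only the caller that actually flips the bit does the work and everyone else
just waits for the log to shut down. The same idiom in portable C11, as a
standalone illustration rather than kernel code:

/* Sketch only: "first caller wins" with C11 atomics. */
#include <stdio.h>
#include <stdatomic.h>

static atomic_flag shutdown_state = ATOMIC_FLAG_INIT;

static void force_shutdown(const char *why)
{
	/* Returns the previous value: true means someone beat us to it. */
	if (atomic_flag_test_and_set(&shutdown_state))
		return;
	printf("shutting down: %s\n", why);	/* runs exactly once */
}

int main(void)
{
	force_shutdown("metadata I/O error");
	force_shutdown("user initiated");	/* silently ignored */
	return 0;
}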
0552 
0553 /*
0554  * Reserve free space for per-AG metadata.
0555  */
0556 int
0557 xfs_fs_reserve_ag_blocks(
0558     struct xfs_mount    *mp)
0559 {
0560     xfs_agnumber_t      agno;
0561     struct xfs_perag    *pag;
0562     int         error = 0;
0563     int         err2;
0564 
0565     mp->m_finobt_nores = false;
0566     for_each_perag(mp, agno, pag) {
0567         err2 = xfs_ag_resv_init(pag, NULL);
0568         if (err2 && !error)
0569             error = err2;
0570     }
0571 
0572     if (error && error != -ENOSPC) {
0573         xfs_warn(mp,
0574     "Error %d reserving per-AG metadata reserve pool.", error);
0575         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
0576     }
0577 
0578     return error;
0579 }
0580 
0581 /*
0582  * Free space reserved for per-AG metadata.
0583  */
0584 int
0585 xfs_fs_unreserve_ag_blocks(
0586     struct xfs_mount    *mp)
0587 {
0588     xfs_agnumber_t      agno;
0589     struct xfs_perag    *pag;
0590     int         error = 0;
0591     int         err2;
0592 
0593     for_each_perag(mp, agno, pag) {
0594         err2 = xfs_ag_resv_free(pag);
0595         if (err2 && !error)
0596             error = err2;
0597     }
0598 
0599     if (error)
0600         xfs_warn(mp,
0601     "Error %d freeing per-AG metadata reserve pool.", error);
0602 
0603     return error;
0604 }
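
Both per-AG walkers above share the same error-handling shape: keep iterating
over every AG even after a failure, and report only the first error to the
caller. Reduced to plain C as a standalone illustration:

/*
 * Sketch only: "remember the first error but visit everything" loop.
 */
#include <stdio.h>
#include <errno.h>

static int init_one(int i)
{
	return (i == 2 || i == 4) ? -ENOSPC : 0;	/* pretend AGs 2 and 4 fail */
}

static int init_all(int count)
{
	int error = 0;

	for (int i = 0; i < count; i++) {
		int err2 = init_one(i);
		if (err2 && !error)
			error = err2;	/* only the first error is kept */
	}
	return error;
}

int main(void)
{
	printf("init_all -> %d\n", init_all(6));	/* -ENOSPC, i.e. -28 */
	return 0;
}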