Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * Copyright (C) 2014 Facebook.  All rights reserved.
0004  */
0005 
0006 #ifndef BTRFS_QGROUP_H
0007 #define BTRFS_QGROUP_H
0008 
0009 #include <linux/spinlock.h>
0010 #include <linux/rbtree.h>
0011 #include <linux/kobject.h>
0012 #include "ulist.h"
0013 #include "delayed-ref.h"
0014 
0015 /*
0016  * Btrfs qgroup overview
0017  *
 * Btrfs qgroup splits into 3 main parts:
0019  * 1) Reserve
0020  *    Reserve metadata/data space for incoming operations
0021  *    Affect how qgroup limit works
0022  *
0023  * 2) Trace
0024  *    Tell btrfs qgroup to trace dirty extents.
0025  *
0026  *    Dirty extents including:
0027  *    - Newly allocated extents
0028  *    - Extents going to be deleted (in this trans)
0029  *    - Extents whose owner is going to be modified
0030  *
 *    This is the main part that affects whether qgroup numbers will
 *    stay consistent.
0033  *    Btrfs qgroup can trace clean extents and won't cause any problem,
0034  *    but it will consume extra CPU time, it should be avoided if possible.
0035  *
0036  * 3) Account
 *    Btrfs qgroup will update its numbers, based on dirty extents traced
0038  *    in previous step.
0039  *
0040  *    Normally at qgroup rescan and transaction commit time.
0041  */
0042 
0043 /*
0044  * Special performance optimization for balance.
0045  *
0046  * For balance, we need to swap subtree of subvolume and reloc trees.
0047  * In theory, we need to trace all subtree blocks of both subvolume and reloc
0048  * trees, since their owner has changed during such swap.
0049  *
0050  * However since balance has ensured that both subtrees are containing the
0051  * same contents and have the same tree structures, such swap won't cause
0052  * qgroup number change.
0053  *
0054  * But there is a race window between subtree swap and transaction commit,
0055  * during that window, if we increase/decrease tree level or merge/split tree
0056  * blocks, we still need to trace the original subtrees.
0057  *
0058  * So for balance, we use a delayed subtree tracing, whose workflow is:
0059  *
 * 1) Record the subtree root block that gets swapped.
0061  *
0062  *    During subtree swap:
0063  *    O = Old tree blocks
0064  *    N = New tree blocks
0065  *          reloc tree                     subvolume tree X
0066  *             Root                               Root
0067  *            /    \                             /    \
0068  *          NA     OB                          OA      OB
0069  *        /  |     |  \                      /  |      |  \
0070  *      NC  ND     OE  OF                   OC  OD     OE  OF
0071  *
0072  *   In this case, NA and OA are going to be swapped, record (NA, OA) into
0073  *   subvolume tree X.
0074  *
0075  * 2) After subtree swap.
0076  *          reloc tree                     subvolume tree X
0077  *             Root                               Root
0078  *            /    \                             /    \
0079  *          OA     OB                          NA      OB
0080  *        /  |     |  \                      /  |      |  \
0081  *      OC  OD     OE  OF                   NC  ND     OE  OF
0082  *
0083  * 3a) COW happens for OB
0084  *     If we are going to COW tree block OB, we check OB's bytenr against
0085  *     tree X's swapped_blocks structure.
0086  *     If it doesn't fit any, nothing will happen.
0087  *
0088  * 3b) COW happens for NA
0089  *     Check NA's bytenr against tree X's swapped_blocks, and get a hit.
0090  *     Then we do subtree scan on both subtrees OA and NA.
0091  *     Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
0092  *
0093  *     Then no matter what we do to subvolume tree X, qgroup numbers will
0094  *     still be correct.
0095  *     Then NA's record gets removed from X's swapped_blocks.
0096  *
0097  * 4)  Transaction commit
0098  *     Any record in X's swapped_blocks gets removed, since there is no
0099  *     modification to the swapped subtrees, no need to trigger heavy qgroup
0100  *     subtree rescan for them.
0101  */
0102 
0103 /*
0104  * Record a dirty extent, and info qgroup to update quota on it
0105  * TODO: Use kmem cache to alloc it.
0106  */
0107 struct btrfs_qgroup_extent_record {
0108     struct rb_node node;
0109     u64 bytenr;
0110     u64 num_bytes;
0111 
0112     /*
0113      * For qgroup reserved data space freeing.
0114      *
0115      * @data_rsv_refroot and @data_rsv will be recorded after
0116      * BTRFS_ADD_DELAYED_EXTENT is called.
0117      * And will be used to free reserved qgroup space at
0118      * transaction commit time.
0119      */
0120     u32 data_rsv;       /* reserved data space needs to be freed */
0121     u64 data_rsv_refroot;   /* which root the reserved data belongs to */
0122     struct ulist *old_roots;
0123 };
0124 
0125 struct btrfs_qgroup_swapped_block {
0126     struct rb_node node;
0127 
0128     int level;
0129     bool trace_leaf;
0130 
0131     /* bytenr/generation of the tree block in subvolume tree after swap */
0132     u64 subvol_bytenr;
0133     u64 subvol_generation;
0134 
0135     /* bytenr/generation of the tree block in reloc tree after swap */
0136     u64 reloc_bytenr;
0137     u64 reloc_generation;
0138 
0139     u64 last_snapshot;
0140     struct btrfs_key first_key;
0141 };
0142 
0143 /*
0144  * Qgroup reservation types:
0145  *
0146  * DATA:
0147  *  space reserved for data
0148  *
0149  * META_PERTRANS:
0150  *  Space reserved for metadata (per-transaction)
0151  *  Due to the fact that qgroup data is only updated at transaction commit
0152  *  time, reserved space for metadata must be kept until transaction
0153  *  commits.
0154  *  Any metadata reserved that are used in btrfs_start_transaction() should
0155  *  be of this type.
0156  *
0157  * META_PREALLOC:
0158  *  There are cases where metadata space is reserved before starting
0159  *  transaction, and then btrfs_join_transaction() to get a trans handle.
0160  *  Any metadata reserved for such usage should be of this type.
0161  *  And after join_transaction() part (or all) of such reservation should
0162  *  be converted into META_PERTRANS.
0163  */
0164 enum btrfs_qgroup_rsv_type {
0165     BTRFS_QGROUP_RSV_DATA,
0166     BTRFS_QGROUP_RSV_META_PERTRANS,
0167     BTRFS_QGROUP_RSV_META_PREALLOC,
0168     BTRFS_QGROUP_RSV_LAST,
0169 };
0170 
0171 /*
0172  * Represents how many bytes we have reserved for this qgroup.
0173  *
0174  * Each type should have different reservation behavior.
0175  * E.g, data follows its io_tree flag modification, while
0176  * *currently* meta is just reserve-and-clear during transaction.
0177  *
0178  * TODO: Add new type for reservation which can survive transaction commit.
0179  * Current metadata reservation behavior is not suitable for such case.
0180  */
0181 struct btrfs_qgroup_rsv {
0182     u64 values[BTRFS_QGROUP_RSV_LAST];
0183 };
0184 
0185 /*
0186  * one struct for each qgroup, organized in fs_info->qgroup_tree.
0187  */
0188 struct btrfs_qgroup {
0189     u64 qgroupid;
0190 
0191     /*
0192      * state
0193      */
0194     u64 rfer;   /* referenced */
0195     u64 rfer_cmpr;  /* referenced compressed */
0196     u64 excl;   /* exclusive */
0197     u64 excl_cmpr;  /* exclusive compressed */
0198 
0199     /*
0200      * limits
0201      */
0202     u64 lim_flags;  /* which limits are set */
0203     u64 max_rfer;
0204     u64 max_excl;
0205     u64 rsv_rfer;
0206     u64 rsv_excl;
0207 
0208     /*
0209      * reservation tracking
0210      */
0211     struct btrfs_qgroup_rsv rsv;
0212 
0213     /*
0214      * lists
0215      */
0216     struct list_head groups;  /* groups this group is member of */
0217     struct list_head members; /* groups that are members of this group */
0218     struct list_head dirty;   /* dirty groups */
0219     struct rb_node node;      /* tree of qgroups */
0220 
0221     /*
0222      * temp variables for accounting operations
0223      * Refer to qgroup_shared_accounting() for details.
0224      */
0225     u64 old_refcnt;
0226     u64 new_refcnt;
0227 
0228     /*
0229      * Sysfs kobjectid
0230      */
0231     struct kobject kobj;
0232 };
0233 
0234 static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
0235 {
0236     return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
0237 }
0238 
0239 /*
0240  * For qgroup event trace points only
0241  */
0242 #define QGROUP_RESERVE      (1<<0)
0243 #define QGROUP_RELEASE      (1<<1)
0244 #define QGROUP_FREE     (1<<2)
0245 
0246 int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
0247 int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
0248 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
0249 void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
0250 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
0251                      bool interruptible);
0252 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
0253                   u64 dst);
0254 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
0255                   u64 dst);
0256 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
0257 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
0258 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
0259                struct btrfs_qgroup_limit *limit);
0260 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
0261 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
0262 struct btrfs_delayed_extent_op;
0263 
0264 /*
0265  * Inform qgroup to trace one dirty extent, its info is recorded in @record.
0266  * So qgroup can account it at transaction committing time.
0267  *
0268  * No lock version, caller must acquire delayed ref lock and allocated memory,
0269  * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
0270  *
0271  * Return 0 for success insert
0272  * Return >0 for existing record, caller can free @record safely.
0273  * Error is not possible
0274  */
0275 int btrfs_qgroup_trace_extent_nolock(
0276         struct btrfs_fs_info *fs_info,
0277         struct btrfs_delayed_ref_root *delayed_refs,
0278         struct btrfs_qgroup_extent_record *record);
0279 
0280 /*
0281  * Post handler after qgroup_trace_extent_nolock().
0282  *
0283  * NOTE: Current qgroup does the expensive backref walk at transaction
0284  * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
0285  * new transaction.
0286  * This is designed to allow btrfs_find_all_roots() to get correct new_roots
0287  * result.
0288  *
0289  * However for old_roots there is no need to do backref walk at that time,
0290  * since we search commit roots to walk backref and result will always be
0291  * correct.
0292  *
0293  * Due to the nature of no lock version, we can't do backref there.
0294  * So we must call btrfs_qgroup_trace_extent_post() after exiting
0295  * spinlock context.
0296  *
0297  * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
0298  * using current root, then we can move all expensive backref walk out of
0299  * transaction committing, but not now as qgroup accounting will be wrong again.
0300  */
0301 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
0302                    struct btrfs_qgroup_extent_record *qrecord);
0303 
0304 /*
0305  * Inform qgroup to trace one dirty extent, specified by @bytenr and
0306  * @num_bytes.
0307  * So qgroup can account it at commit trans time.
0308  *
0309  * Better encapsulated version, with memory allocation and backref walk for
0310  * commit roots.
0311  * So this can sleep.
0312  *
0313  * Return 0 if the operation is done.
0314  * Return <0 for error, like memory allocation failure or invalid parameter
0315  * (NULL trans)
0316  */
0317 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
0318                   u64 num_bytes, gfp_t gfp_flag);
0319 
0320 /*
0321  * Inform qgroup to trace all leaf items of data
0322  *
0323  * Return 0 for success
0324  * Return <0 for error(ENOMEM)
0325  */
0326 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
0327                   struct extent_buffer *eb);
0328 /*
0329  * Inform qgroup to trace a whole subtree, including all its child tree
0330  * blocks and data.
0331  * The root tree block is specified by @root_eb.
0332  *
0333  * Normally used by relocation(tree block swap) and subvolume deletion.
0334  *
0335  * Return 0 for success
0336  * Return <0 for error(ENOMEM or tree search error)
0337  */
0338 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
0339                    struct extent_buffer *root_eb,
0340                    u64 root_gen, int root_level);
0341 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
0342                 u64 num_bytes, struct ulist *old_roots,
0343                 struct ulist *new_roots);
0344 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
0345 int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
0346 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
0347              u64 objectid, struct btrfs_qgroup_inherit *inherit);
0348 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
0349                    u64 ref_root, u64 num_bytes,
0350                    enum btrfs_qgroup_rsv_type type);
0351 
0352 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0353 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
0354                    u64 rfer, u64 excl);
0355 #endif
0356 
0357 /* New io_tree based accurate qgroup reserve API */
0358 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
0359             struct extent_changeset **reserved, u64 start, u64 len);
0360 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
0361 int btrfs_qgroup_free_data(struct btrfs_inode *inode,
0362                struct extent_changeset *reserved, u64 start,
0363                u64 len);
0364 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
0365                   enum btrfs_qgroup_rsv_type type, bool enforce);
0366 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
0367                 enum btrfs_qgroup_rsv_type type, bool enforce,
0368                 bool noflush);
0369 /* Reserve metadata space for pertrans and prealloc type */
0370 static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
0371                 int num_bytes, bool enforce)
0372 {
0373     return __btrfs_qgroup_reserve_meta(root, num_bytes,
0374                        BTRFS_QGROUP_RSV_META_PERTRANS,
0375                        enforce, false);
0376 }
0377 static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
0378                              int num_bytes, bool enforce,
0379                              bool noflush)
0380 {
0381     return __btrfs_qgroup_reserve_meta(root, num_bytes,
0382                        BTRFS_QGROUP_RSV_META_PREALLOC,
0383                        enforce, noflush);
0384 }
0385 
0386 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
0387                  enum btrfs_qgroup_rsv_type type);
0388 
0389 /* Free per-transaction meta reservation for error handling */
0390 static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
0391                            int num_bytes)
0392 {
0393     __btrfs_qgroup_free_meta(root, num_bytes,
0394             BTRFS_QGROUP_RSV_META_PERTRANS);
0395 }
0396 
0397 /* Pre-allocated meta reservation can be freed at need */
0398 static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
0399                            int num_bytes)
0400 {
0401     __btrfs_qgroup_free_meta(root, num_bytes,
0402             BTRFS_QGROUP_RSV_META_PREALLOC);
0403 }
0404 
0405 /*
0406  * Per-transaction meta reservation should be all freed at transaction commit
0407  * time
0408  */
0409 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
0410 
0411 /*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
0413  *
0414  * This is called when preallocated meta reservation needs to be used.
0415  * Normally after btrfs_join_transaction() call.
0416  */
0417 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
0418 
0419 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
0420 
0421 /* btrfs_qgroup_swapped_blocks related functions */
0422 void btrfs_qgroup_init_swapped_blocks(
0423     struct btrfs_qgroup_swapped_blocks *swapped_blocks);
0424 
0425 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
0426 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
0427         struct btrfs_root *subvol_root,
0428         struct btrfs_block_group *bg,
0429         struct extent_buffer *subvol_parent, int subvol_slot,
0430         struct extent_buffer *reloc_parent, int reloc_slot,
0431         u64 last_snapshot);
0432 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
0433         struct btrfs_root *root, struct extent_buffer *eb);
0434 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
0435 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
0436 
0437 #endif