fs/btrfs/qgroup.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2011 STRATO.  All rights reserved.
0004  */
0005
0006 #include <linux/sched.h>
0007 #include <linux/pagemap.h>
0008 #include <linux/writeback.h>
0009 #include <linux/blkdev.h>
0010 #include <linux/rbtree.h>
0011 #include <linux/slab.h>
0012 #include <linux/workqueue.h>
0013 #include <linux/btrfs.h>
0014 #include <linux/sched/mm.h>
0015
0016 #include "ctree.h"
0017 #include "transaction.h"
0018 #include "disk-io.h"
0019 #include "locking.h"
0020 #include "ulist.h"
0021 #include "backref.h"
0022 #include "extent_io.h"
0023 #include "qgroup.h"
0024 #include "block-group.h"
0025 #include "sysfs.h"
0026 #include "tree-mod-log.h"
0027
0028 /*
0029  * Helpers to access qgroup reservation
0030  *
0031  * Callers should ensure the lock context and type are valid
0032  */
0033
0034 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
0035 {
0036     u64 ret = 0;
0037     int i;
0038
0039     for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
0040         ret += qgroup->rsv.values[i];
0041
0042     return ret;
0043 }
0044
0045 #ifdef CONFIG_BTRFS_DEBUG
0046 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
0047 {
0048     if (type == BTRFS_QGROUP_RSV_DATA)
0049         return "data";
0050     if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
0051         return "meta_pertrans";
0052     if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
0053         return "meta_prealloc";
0054     return NULL;
0055 }
0056 #endif
0057
0058 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
0059                struct btrfs_qgroup *qgroup, u64 num_bytes,
0060                enum btrfs_qgroup_rsv_type type)
0061 {
0062     trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
0063     qgroup->rsv.values[type] += num_bytes;
0064 }
0065
0066 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
0067                    struct btrfs_qgroup *qgroup, u64 num_bytes,
0068                    enum btrfs_qgroup_rsv_type type)
0069 {
0070     trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
0071     if (qgroup->rsv.values[type] >= num_bytes) {
0072         qgroup->rsv.values[type] -= num_bytes;
0073         return;
0074     }
0075 #ifdef CONFIG_BTRFS_DEBUG
0076     WARN_RATELIMIT(1,
0077         "qgroup %llu %s reserved space underflow, have %llu to free %llu",
0078         qgroup->qgroupid, qgroup_rsv_type_str(type),
0079         qgroup->rsv.values[type], num_bytes);
0080 #endif
0081     qgroup->rsv.values[type] = 0;
0082 }
0083
0084 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
0085                      struct btrfs_qgroup *dest,
0086                      struct btrfs_qgroup *src)
0087 {
0088     int i;
0089
0090     for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
0091         qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
0092 }
0093
0094 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
0095                      struct btrfs_qgroup *dest,
0096                       struct btrfs_qgroup *src)
0097 {
0098     int i;
0099
0100     for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
0101         qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
0102 }
0103
0104 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
0105                        int mod)
0106 {
0107     if (qg->old_refcnt < seq)
0108         qg->old_refcnt = seq;
0109     qg->old_refcnt += mod;
0110 }
0111
0112 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
0113                        int mod)
0114 {
0115     if (qg->new_refcnt < seq)
0116         qg->new_refcnt = seq;
0117     qg->new_refcnt += mod;
0118 }
0119
0120 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
0121 {
0122     if (qg->old_refcnt < seq)
0123         return 0;
0124     return qg->old_refcnt - seq;
0125 }
0126
0127 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
0128 {
0129     if (qg->new_refcnt < seq)
0130         return 0;
0131     return qg->new_refcnt - seq;
0132 }
0133
0134 /*
0135  * glue structure to represent the relations between qgroups.
0136  */
0137 struct btrfs_qgroup_list {
0138     struct list_head next_group;
0139     struct list_head next_member;
0140     struct btrfs_qgroup *group;
0141     struct btrfs_qgroup *member;
0142 };
0143
0144 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
0145 {
0146     return (u64)(uintptr_t)qg;
0147 }
0148
0149 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
0150 {
0151     return (struct btrfs_qgroup *)(uintptr_t)n->aux;
0152 }
0153
0154 static int
0155 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
0156            int init_flags);
0157 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
0158
0159 /* must be called with qgroup_ioctl_lock held */
0160 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
0161                        u64 qgroupid)
0162 {
0163     struct rb_node *n = fs_info->qgroup_tree.rb_node;
0164     struct btrfs_qgroup *qgroup;
0165
0166     while (n) {
0167         qgroup = rb_entry(n, struct btrfs_qgroup, node);
0168         if (qgroup->qgroupid < qgroupid)
0169             n = n->rb_left;
0170         else if (qgroup->qgroupid > qgroupid)
0171             n = n->rb_right;
0172         else
0173             return qgroup;
0174     }
0175     return NULL;
0176 }
0177
0178 /* must be called with qgroup_lock held */
0179 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
0180                       u64 qgroupid)
0181 {
0182     struct rb_node **p = &fs_info->qgroup_tree.rb_node;
0183     struct rb_node *parent = NULL;
0184     struct btrfs_qgroup *qgroup;
0185
0186     while (*p) {
0187         parent = *p;
0188         qgroup = rb_entry(parent, struct btrfs_qgroup, node);
0189
0190         if (qgroup->qgroupid < qgroupid)
0191             p = &(*p)->rb_left;
0192         else if (qgroup->qgroupid > qgroupid)
0193             p = &(*p)->rb_right;
0194         else
0195             return qgroup;
0196     }
0197
0198     qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
0199     if (!qgroup)
0200         return ERR_PTR(-ENOMEM);
0201
0202     qgroup->qgroupid = qgroupid;
0203     INIT_LIST_HEAD(&qgroup->groups);
0204     INIT_LIST_HEAD(&qgroup->members);
0205     INIT_LIST_HEAD(&qgroup->dirty);
0206
0207     rb_link_node(&qgroup->node, parent, p);
0208     rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
0209
0210     return qgroup;
0211 }
0212
0213 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
0214                 struct btrfs_qgroup *qgroup)
0215 {
0216     struct btrfs_qgroup_list *list;
0217
0218     list_del(&qgroup->dirty);
0219     while (!list_empty(&qgroup->groups)) {
0220         list = list_first_entry(&qgroup->groups,
0221                     struct btrfs_qgroup_list, next_group);
0222         list_del(&list->next_group);
0223         list_del(&list->next_member);
0224         kfree(list);
0225     }
0226
0227     while (!list_empty(&qgroup->members)) {
0228         list = list_first_entry(&qgroup->members,
0229                     struct btrfs_qgroup_list, next_member);
0230         list_del(&list->next_group);
0231         list_del(&list->next_member);
0232         kfree(list);
0233     }
0234 }
0235
0236 /* must be called with qgroup_lock held */
0237 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
0238 {
0239     struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
0240
0241     if (!qgroup)
0242         return -ENOENT;
0243
0244     rb_erase(&qgroup->node, &fs_info->qgroup_tree);
0245     __del_qgroup_rb(fs_info, qgroup);
0246     return 0;
0247 }
0248
0249 /*
0250  * Add relation specified by two qgroups.
0251  *
0252  * Must be called with qgroup_lock held.
0253  *
0254  * Return: 0        on success
0255  *         -ENOENT  if one of the qgroups is NULL
0256  *         <0       other errors
0257  */
0258 static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
0259 {
0260     struct btrfs_qgroup_list *list;
0261
0262     if (!member || !parent)
0263         return -ENOENT;
0264
0265     list = kzalloc(sizeof(*list), GFP_ATOMIC);
0266     if (!list)
0267         return -ENOMEM;
0268
0269     list->group = parent;
0270     list->member = member;
0271     list_add_tail(&list->next_group, &member->groups);
0272     list_add_tail(&list->next_member, &parent->members);
0273
0274     return 0;
0275 }
0276
0277 /*
0278  * Add relation specified by two qgoup ids.
0279  *
0280  * Must be called with qgroup_lock held.
0281  *
0282  * Return: 0        on success
0283  *         -ENOENT  if one of the ids does not exist
0284  *         <0       other errors
0285  */
0286 static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
0287 {
0288     struct btrfs_qgroup *member;
0289     struct btrfs_qgroup *parent;
0290
0291     member = find_qgroup_rb(fs_info, memberid);
0292     parent = find_qgroup_rb(fs_info, parentid);
0293
0294     return __add_relation_rb(member, parent);
0295 }
0296
0297 /* Must be called with qgroup_lock held */
0298 static int del_relation_rb(struct btrfs_fs_info *fs_info,
0299                u64 memberid, u64 parentid)
0300 {
0301     struct btrfs_qgroup *member;
0302     struct btrfs_qgroup *parent;
0303     struct btrfs_qgroup_list *list;
0304
0305     member = find_qgroup_rb(fs_info, memberid);
0306     parent = find_qgroup_rb(fs_info, parentid);
0307     if (!member || !parent)
0308         return -ENOENT;
0309
0310     list_for_each_entry(list, &member->groups, next_group) {
0311         if (list->group == parent) {
0312             list_del(&list->next_group);
0313             list_del(&list->next_member);
0314             kfree(list);
0315             return 0;
0316         }
0317     }
0318     return -ENOENT;
0319 }
0320
0321 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0322 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
0323                    u64 rfer, u64 excl)
0324 {
0325     struct btrfs_qgroup *qgroup;
0326
0327     qgroup = find_qgroup_rb(fs_info, qgroupid);
0328     if (!qgroup)
0329         return -EINVAL;
0330     if (qgroup->rfer != rfer || qgroup->excl != excl)
0331         return -EINVAL;
0332     return 0;
0333 }
0334 #endif
0335
0336 /*
0337  * The full config is read in one go, only called from open_ctree()
0338  * It doesn't use any locking, as at this point we're still single-threaded
0339  */
0340 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
0341 {
0342     struct btrfs_key key;
0343     struct btrfs_key found_key;
0344     struct btrfs_root *quota_root = fs_info->quota_root;
0345     struct btrfs_path *path = NULL;
0346     struct extent_buffer *l;
0347     int slot;
0348     int ret = 0;
0349     u64 flags = 0;
0350     u64 rescan_progress = 0;
0351
0352     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
0353         return 0;
0354
0355     fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
0356     if (!fs_info->qgroup_ulist) {
0357         ret = -ENOMEM;
0358         goto out;
0359     }
0360
0361     path = btrfs_alloc_path();
0362     if (!path) {
0363         ret = -ENOMEM;
0364         goto out;
0365     }
0366
0367     ret = btrfs_sysfs_add_qgroups(fs_info);
0368     if (ret < 0)
0369         goto out;
0370     /* default this to quota off, in case no status key is found */
0371     fs_info->qgroup_flags = 0;
0372
0373     /*
0374      * pass 1: read status, all qgroup infos and limits
0375      */
0376     key.objectid = 0;
0377     key.type = 0;
0378     key.offset = 0;
0379     ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
0380     if (ret)
0381         goto out;
0382
0383     while (1) {
0384         struct btrfs_qgroup *qgroup;
0385
0386         slot = path->slots[0];
0387         l = path->nodes[0];
0388         btrfs_item_key_to_cpu(l, &found_key, slot);
0389
0390         if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
0391             struct btrfs_qgroup_status_item *ptr;
0392
0393             ptr = btrfs_item_ptr(l, slot,
0394                          struct btrfs_qgroup_status_item);
0395
0396             if (btrfs_qgroup_status_version(l, ptr) !=
0397                 BTRFS_QGROUP_STATUS_VERSION) {
0398                 btrfs_err(fs_info,
0399                  "old qgroup version, quota disabled");
0400                 goto out;
0401             }
0402             if (btrfs_qgroup_status_generation(l, ptr) !=
0403                 fs_info->generation) {
0404                 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
0405                 btrfs_err(fs_info,
0406                     "qgroup generation mismatch, marked as inconsistent");
0407             }
0408             fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
0409                                       ptr);
0410             rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
0411             goto next1;
0412         }
0413
0414         if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
0415             found_key.type != BTRFS_QGROUP_LIMIT_KEY)
0416             goto next1;
0417
0418         qgroup = find_qgroup_rb(fs_info, found_key.offset);
0419         if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
0420             (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
0421             btrfs_err(fs_info, "inconsistent qgroup config");
0422             flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
0423         }
0424         if (!qgroup) {
0425             qgroup = add_qgroup_rb(fs_info, found_key.offset);
0426             if (IS_ERR(qgroup)) {
0427                 ret = PTR_ERR(qgroup);
0428                 goto out;
0429             }
0430         }
0431         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
0432         if (ret < 0)
0433             goto out;
0434
0435         switch (found_key.type) {
0436         case BTRFS_QGROUP_INFO_KEY: {
0437             struct btrfs_qgroup_info_item *ptr;
0438
0439             ptr = btrfs_item_ptr(l, slot,
0440                          struct btrfs_qgroup_info_item);
0441             qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
0442             qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
0443             qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
0444             qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
0445             /* generation currently unused */
0446             break;
0447         }
0448         case BTRFS_QGROUP_LIMIT_KEY: {
0449             struct btrfs_qgroup_limit_item *ptr;
0450
0451             ptr = btrfs_item_ptr(l, slot,
0452                          struct btrfs_qgroup_limit_item);
0453             qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
0454             qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
0455             qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
0456             qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
0457             qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
0458             break;
0459         }
0460         }
0461 next1:
0462         ret = btrfs_next_item(quota_root, path);
0463         if (ret < 0)
0464             goto out;
0465         if (ret)
0466             break;
0467     }
0468     btrfs_release_path(path);
0469
0470     /*
0471      * pass 2: read all qgroup relations
0472      */
0473     key.objectid = 0;
0474     key.type = BTRFS_QGROUP_RELATION_KEY;
0475     key.offset = 0;
0476     ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
0477     if (ret)
0478         goto out;
0479     while (1) {
0480         slot = path->slots[0];
0481         l = path->nodes[0];
0482         btrfs_item_key_to_cpu(l, &found_key, slot);
0483
0484         if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
0485             goto next2;
0486
0487         if (found_key.objectid > found_key.offset) {
0488             /* parent <- member, not needed to build config */
0489             /* FIXME should we omit the key completely? */
0490             goto next2;
0491         }
0492
0493         ret = add_relation_rb(fs_info, found_key.objectid,
0494                       found_key.offset);
0495         if (ret == -ENOENT) {
0496             btrfs_warn(fs_info,
0497                 "orphan qgroup relation 0x%llx->0x%llx",
0498                 found_key.objectid, found_key.offset);
0499             ret = 0;    /* ignore the error */
0500         }
0501         if (ret)
0502             goto out;
0503 next2:
0504         ret = btrfs_next_item(quota_root, path);
0505         if (ret < 0)
0506             goto out;
0507         if (ret)
0508             break;
0509     }
0510 out:
0511     btrfs_free_path(path);
0512     fs_info->qgroup_flags |= flags;
0513     if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
0514         clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
0515     else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
0516          ret >= 0)
0517         ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
0518
0519     if (ret < 0) {
0520         ulist_free(fs_info->qgroup_ulist);
0521         fs_info->qgroup_ulist = NULL;
0522         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
0523         btrfs_sysfs_del_qgroups(fs_info);
0524     }
0525
0526     return ret < 0 ? ret : 0;
0527 }
0528
0529 /*
0530  * Called in close_ctree() when quota is still enabled.  This verifies we don't
0531  * leak some reserved space.
0532  *
0533  * Return false if no reserved space is left.
0534  * Return true if some reserved space is leaked.
0535  */
0536 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
0537 {
0538     struct rb_node *node;
0539     bool ret = false;
0540
0541     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
0542         return ret;
0543     /*
0544      * Since we're unmounting, there is no race and no need to grab qgroup
0545      * lock.  And here we don't go post-order to provide a more user
0546      * friendly sorted result.
0547      */
0548     for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
0549         struct btrfs_qgroup *qgroup;
0550         int i;
0551
0552         qgroup = rb_entry(node, struct btrfs_qgroup, node);
0553         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
0554             if (qgroup->rsv.values[i]) {
0555                 ret = true;
0556                 btrfs_warn(fs_info,
0557         "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
0558                    btrfs_qgroup_level(qgroup->qgroupid),
0559                    btrfs_qgroup_subvolid(qgroup->qgroupid),
0560                    i, qgroup->rsv.values[i]);
0561             }
0562         }
0563     }
0564     return ret;
0565 }
0566
0567 /*
0568  * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
0569  * first two are in single-threaded paths.And for the third one, we have set
0570  * quota_root to be null with qgroup_lock held before, so it is safe to clean
0571  * up the in-memory structures without qgroup_lock held.
0572  */
0573 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
0574 {
0575     struct rb_node *n;
0576     struct btrfs_qgroup *qgroup;
0577
0578     while ((n = rb_first(&fs_info->qgroup_tree))) {
0579         qgroup = rb_entry(n, struct btrfs_qgroup, node);
0580         rb_erase(n, &fs_info->qgroup_tree);
0581         __del_qgroup_rb(fs_info, qgroup);
0582         btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
0583         kfree(qgroup);
0584     }
0585     /*
0586      * We call btrfs_free_qgroup_config() when unmounting
0587      * filesystem and disabling quota, so we set qgroup_ulist
0588      * to be null here to avoid double free.
0589      */
0590     ulist_free(fs_info->qgroup_ulist);
0591     fs_info->qgroup_ulist = NULL;
0592     btrfs_sysfs_del_qgroups(fs_info);
0593 }
0594
0595 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
0596                     u64 dst)
0597 {
0598     int ret;
0599     struct btrfs_root *quota_root = trans->fs_info->quota_root;
0600     struct btrfs_path *path;
0601     struct btrfs_key key;
0602
0603     path = btrfs_alloc_path();
0604     if (!path)
0605         return -ENOMEM;
0606
0607     key.objectid = src;
0608     key.type = BTRFS_QGROUP_RELATION_KEY;
0609     key.offset = dst;
0610
0611     ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
0612
0613     btrfs_mark_buffer_dirty(path->nodes[0]);
0614
0615     btrfs_free_path(path);
0616     return ret;
0617 }
0618
0619 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
0620                     u64 dst)
0621 {
0622     int ret;
0623     struct btrfs_root *quota_root = trans->fs_info->quota_root;
0624     struct btrfs_path *path;
0625     struct btrfs_key key;
0626
0627     path = btrfs_alloc_path();
0628     if (!path)
0629         return -ENOMEM;
0630
0631     key.objectid = src;
0632     key.type = BTRFS_QGROUP_RELATION_KEY;
0633     key.offset = dst;
0634
0635     ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
0636     if (ret < 0)
0637         goto out;
0638
0639     if (ret > 0) {
0640         ret = -ENOENT;
0641         goto out;
0642     }
0643
0644     ret = btrfs_del_item(trans, quota_root, path);
0645 out:
0646     btrfs_free_path(path);
0647     return ret;
0648 }
0649
0650 static int add_qgroup_item(struct btrfs_trans_handle *trans,
0651                struct btrfs_root *quota_root, u64 qgroupid)
0652 {
0653     int ret;
0654     struct btrfs_path *path;
0655     struct btrfs_qgroup_info_item *qgroup_info;
0656     struct btrfs_qgroup_limit_item *qgroup_limit;
0657     struct extent_buffer *leaf;
0658     struct btrfs_key key;
0659
0660     if (btrfs_is_testing(quota_root->fs_info))
0661         return 0;
0662
0663     path = btrfs_alloc_path();
0664     if (!path)
0665         return -ENOMEM;
0666
0667     key.objectid = 0;
0668     key.type = BTRFS_QGROUP_INFO_KEY;
0669     key.offset = qgroupid;
0670
0671     /*
0672      * Avoid a transaction abort by catching -EEXIST here. In that
0673      * case, we proceed by re-initializing the existing structure
0674      * on disk.
0675      */
0676
0677     ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
0678                       sizeof(*qgroup_info));
0679     if (ret && ret != -EEXIST)
0680         goto out;
0681
0682     leaf = path->nodes[0];
0683     qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
0684                  struct btrfs_qgroup_info_item);
0685     btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
0686     btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
0687     btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
0688     btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
0689     btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
0690
0691     btrfs_mark_buffer_dirty(leaf);
0692
0693     btrfs_release_path(path);
0694
0695     key.type = BTRFS_QGROUP_LIMIT_KEY;
0696     ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
0697                       sizeof(*qgroup_limit));
0698     if (ret && ret != -EEXIST)
0699         goto out;
0700
0701     leaf = path->nodes[0];
0702     qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
0703                   struct btrfs_qgroup_limit_item);
0704     btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
0705     btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
0706     btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
0707     btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
0708     btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
0709
0710     btrfs_mark_buffer_dirty(leaf);
0711
0712     ret = 0;
0713 out:
0714     btrfs_free_path(path);
0715     return ret;
0716 }
0717
0718 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
0719 {
0720     int ret;
0721     struct btrfs_root *quota_root = trans->fs_info->quota_root;
0722     struct btrfs_path *path;
0723     struct btrfs_key key;
0724
0725     path = btrfs_alloc_path();
0726     if (!path)
0727         return -ENOMEM;
0728
0729     key.objectid = 0;
0730     key.type = BTRFS_QGROUP_INFO_KEY;
0731     key.offset = qgroupid;
0732     ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
0733     if (ret < 0)
0734         goto out;
0735
0736     if (ret > 0) {
0737         ret = -ENOENT;
0738         goto out;
0739     }
0740
0741     ret = btrfs_del_item(trans, quota_root, path);
0742     if (ret)
0743         goto out;
0744
0745     btrfs_release_path(path);
0746
0747     key.type = BTRFS_QGROUP_LIMIT_KEY;
0748     ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
0749     if (ret < 0)
0750         goto out;
0751
0752     if (ret > 0) {
0753         ret = -ENOENT;
0754         goto out;
0755     }
0756
0757     ret = btrfs_del_item(trans, quota_root, path);
0758
0759 out:
0760     btrfs_free_path(path);
0761     return ret;
0762 }
0763
0764 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
0765                     struct btrfs_qgroup *qgroup)
0766 {
0767     struct btrfs_root *quota_root = trans->fs_info->quota_root;
0768     struct btrfs_path *path;
0769     struct btrfs_key key;
0770     struct extent_buffer *l;
0771     struct btrfs_qgroup_limit_item *qgroup_limit;
0772     int ret;
0773     int slot;
0774
0775     key.objectid = 0;
0776     key.type = BTRFS_QGROUP_LIMIT_KEY;
0777     key.offset = qgroup->qgroupid;
0778
0779     path = btrfs_alloc_path();
0780     if (!path)
0781         return -ENOMEM;
0782
0783     ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
0784     if (ret > 0)
0785         ret = -ENOENT;
0786
0787     if (ret)
0788         goto out;
0789
0790     l = path->nodes[0];
0791     slot = path->slots[0];
0792     qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
0793     btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
0794     btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
0795     btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
0796     btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
0797     btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
0798
0799     btrfs_mark_buffer_dirty(l);
0800
0801 out:
0802     btrfs_free_path(path);
0803     return ret;
0804 }
0805
0806 static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
0807                    struct btrfs_qgroup *qgroup)
0808 {
0809     struct btrfs_fs_info *fs_info = trans->fs_info;
0810     struct btrfs_root *quota_root = fs_info->quota_root;
0811     struct btrfs_path *path;
0812     struct btrfs_key key;
0813     struct extent_buffer *l;
0814     struct btrfs_qgroup_info_item *qgroup_info;
0815     int ret;
0816     int slot;
0817
0818     if (btrfs_is_testing(fs_info))
0819         return 0;
0820
0821     key.objectid = 0;
0822     key.type = BTRFS_QGROUP_INFO_KEY;
0823     key.offset = qgroup->qgroupid;
0824
0825     path = btrfs_alloc_path();
0826     if (!path)
0827         return -ENOMEM;
0828
0829     ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
0830     if (ret > 0)
0831         ret = -ENOENT;
0832
0833     if (ret)
0834         goto out;
0835
0836     l = path->nodes[0];
0837     slot = path->slots[0];
0838     qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
0839     btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
0840     btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
0841     btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
0842     btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
0843     btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
0844
0845     btrfs_mark_buffer_dirty(l);
0846
0847 out:
0848     btrfs_free_path(path);
0849     return ret;
0850 }
0851
0852 static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
0853 {
0854     struct btrfs_fs_info *fs_info = trans->fs_info;
0855     struct btrfs_root *quota_root = fs_info->quota_root;
0856     struct btrfs_path *path;
0857     struct btrfs_key key;
0858     struct extent_buffer *l;
0859     struct btrfs_qgroup_status_item *ptr;
0860     int ret;
0861     int slot;
0862
0863     key.objectid = 0;
0864     key.type = BTRFS_QGROUP_STATUS_KEY;
0865     key.offset = 0;
0866
0867     path = btrfs_alloc_path();
0868     if (!path)
0869         return -ENOMEM;
0870
0871     ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
0872     if (ret > 0)
0873         ret = -ENOENT;
0874
0875     if (ret)
0876         goto out;
0877
0878     l = path->nodes[0];
0879     slot = path->slots[0];
0880     ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
0881     btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
0882     btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
0883     btrfs_set_qgroup_status_rescan(l, ptr,
0884                 fs_info->qgroup_rescan_progress.objectid);
0885
0886     btrfs_mark_buffer_dirty(l);
0887
0888 out:
0889     btrfs_free_path(path);
0890     return ret;
0891 }
0892
0893 /*
0894  * called with qgroup_lock held
0895  */
0896 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
0897                   struct btrfs_root *root)
0898 {
0899     struct btrfs_path *path;
0900     struct btrfs_key key;
0901     struct extent_buffer *leaf = NULL;
0902     int ret;
0903     int nr = 0;
0904
0905     path = btrfs_alloc_path();
0906     if (!path)
0907         return -ENOMEM;
0908
0909     key.objectid = 0;
0910     key.offset = 0;
0911     key.type = 0;
0912
0913     while (1) {
0914         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
0915         if (ret < 0)
0916             goto out;
0917         leaf = path->nodes[0];
0918         nr = btrfs_header_nritems(leaf);
0919         if (!nr)
0920             break;
0921         /*
0922          * delete the leaf one by one
0923          * since the whole tree is going
0924          * to be deleted.
0925          */
0926         path->slots[0] = 0;
0927         ret = btrfs_del_items(trans, root, path, 0, nr);
0928         if (ret)
0929             goto out;
0930
0931         btrfs_release_path(path);
0932     }
0933     ret = 0;
0934 out:
0935     btrfs_free_path(path);
0936     return ret;
0937 }
0938
0939 int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
0940 {
0941     struct btrfs_root *quota_root;
0942     struct btrfs_root *tree_root = fs_info->tree_root;
0943     struct btrfs_path *path = NULL;
0944     struct btrfs_qgroup_status_item *ptr;
0945     struct extent_buffer *leaf;
0946     struct btrfs_key key;
0947     struct btrfs_key found_key;
0948     struct btrfs_qgroup *qgroup = NULL;
0949     struct btrfs_trans_handle *trans = NULL;
0950     struct ulist *ulist = NULL;
0951     int ret = 0;
0952     int slot;
0953
0954     /*
0955      * We need to have subvol_sem write locked, to prevent races between
0956      * concurrent tasks trying to enable quotas, because we will unlock
0957      * and relock qgroup_ioctl_lock before setting fs_info->quota_root
0958      * and before setting BTRFS_FS_QUOTA_ENABLED.
0959      */
0960     lockdep_assert_held_write(&fs_info->subvol_sem);
0961
0962     if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
0963         btrfs_err(fs_info,
0964               "qgroups are currently unsupported in extent tree v2");
0965         return -EINVAL;
0966     }
0967
0968     mutex_lock(&fs_info->qgroup_ioctl_lock);
0969     if (fs_info->quota_root)
0970         goto out;
0971
0972     ulist = ulist_alloc(GFP_KERNEL);
0973     if (!ulist) {
0974         ret = -ENOMEM;
0975         goto out;
0976     }
0977
0978     ret = btrfs_sysfs_add_qgroups(fs_info);
0979     if (ret < 0)
0980         goto out;
0981
0982     /*
0983      * Unlock qgroup_ioctl_lock before starting the transaction. This is to
0984      * avoid lock acquisition inversion problems (reported by lockdep) between
0985      * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
0986      * start a transaction.
0987      * After we started the transaction lock qgroup_ioctl_lock again and
0988      * check if someone else created the quota root in the meanwhile. If so,
0989      * just return success and release the transaction handle.
0990      *
0991      * Also we don't need to worry about someone else calling
0992      * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
0993      * that function returns 0 (success) when the sysfs entries already exist.
0994      */
0995     mutex_unlock(&fs_info->qgroup_ioctl_lock);
0996
0997     /*
0998      * 1 for quota root item
0999      * 1 for BTRFS_QGROUP_STATUS item
1000      *
1001      * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
1002      * per subvolume. However those are not currently reserved since it
1003      * would be a lot of overkill.
1004      */
1005     trans = btrfs_start_transaction(tree_root, 2);
1006
1007     mutex_lock(&fs_info->qgroup_ioctl_lock);
1008     if (IS_ERR(trans)) {
1009         ret = PTR_ERR(trans);
1010         trans = NULL;
1011         goto out;
1012     }
1013
1014     if (fs_info->quota_root)
1015         goto out;
1016
1017     fs_info->qgroup_ulist = ulist;
1018     ulist = NULL;
1019
1020     /*
1021      * initially create the quota tree
1022      */
1023     quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
1024     if (IS_ERR(quota_root)) {
1025         ret =  PTR_ERR(quota_root);
1026         btrfs_abort_transaction(trans, ret);
1027         goto out;
1028     }
1029
1030     path = btrfs_alloc_path();
1031     if (!path) {
1032         ret = -ENOMEM;
1033         btrfs_abort_transaction(trans, ret);
1034         goto out_free_root;
1035     }
1036
1037     key.objectid = 0;
1038     key.type = BTRFS_QGROUP_STATUS_KEY;
1039     key.offset = 0;
1040
1041     ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
1042                       sizeof(*ptr));
1043     if (ret) {
1044         btrfs_abort_transaction(trans, ret);
1045         goto out_free_path;
1046     }
1047
1048     leaf = path->nodes[0];
1049     ptr = btrfs_item_ptr(leaf, path->slots[0],
1050                  struct btrfs_qgroup_status_item);
1051     btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
1052     btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
1053     fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
1054                 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1055     btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
1056     btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
1057
1058     btrfs_mark_buffer_dirty(leaf);
1059
1060     key.objectid = 0;
1061     key.type = BTRFS_ROOT_REF_KEY;
1062     key.offset = 0;
1063
1064     btrfs_release_path(path);
1065     ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
1066     if (ret > 0)
1067         goto out_add_root;
1068     if (ret < 0) {
1069         btrfs_abort_transaction(trans, ret);
1070         goto out_free_path;
1071     }
1072
1073     while (1) {
1074         slot = path->slots[0];
1075         leaf = path->nodes[0];
1076         btrfs_item_key_to_cpu(leaf, &found_key, slot);
1077
1078         if (found_key.type == BTRFS_ROOT_REF_KEY) {
1079
1080             /* Release locks on tree_root before we access quota_root */
1081             btrfs_release_path(path);
1082
1083             ret = add_qgroup_item(trans, quota_root,
1084                           found_key.offset);
1085             if (ret) {
1086                 btrfs_abort_transaction(trans, ret);
1087                 goto out_free_path;
1088             }
1089
1090             qgroup = add_qgroup_rb(fs_info, found_key.offset);
1091             if (IS_ERR(qgroup)) {
1092                 ret = PTR_ERR(qgroup);
1093                 btrfs_abort_transaction(trans, ret);
1094                 goto out_free_path;
1095             }
1096             ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1097             if (ret < 0) {
1098                 btrfs_abort_transaction(trans, ret);
1099                 goto out_free_path;
1100             }
1101             ret = btrfs_search_slot_for_read(tree_root, &found_key,
1102                              path, 1, 0);
1103             if (ret < 0) {
1104                 btrfs_abort_transaction(trans, ret);
1105                 goto out_free_path;
1106             }
1107             if (ret > 0) {
1108                 /*
1109                  * Shouldn't happen, but in case it does we
1110                  * don't need to do the btrfs_next_item, just
1111                  * continue.
1112                  */
1113                 continue;
1114             }
1115         }
1116         ret = btrfs_next_item(tree_root, path);
1117         if (ret < 0) {
1118             btrfs_abort_transaction(trans, ret);
1119             goto out_free_path;
1120         }
1121         if (ret)
1122             break;
1123     }
1124
1125 out_add_root:
1126     btrfs_release_path(path);
1127     ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
1128     if (ret) {
1129         btrfs_abort_transaction(trans, ret);
1130         goto out_free_path;
1131     }
1132
1133     qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
1134     if (IS_ERR(qgroup)) {
1135         ret = PTR_ERR(qgroup);
1136         btrfs_abort_transaction(trans, ret);
1137         goto out_free_path;
1138     }
1139     ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1140     if (ret < 0) {
1141         btrfs_abort_transaction(trans, ret);
1142         goto out_free_path;
1143     }
1144
1145     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1146     /*
1147      * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
1148      * a deadlock with tasks concurrently doing other qgroup operations, such
1149      * adding/removing qgroups or adding/deleting qgroup relations for example,
1150      * because all qgroup operations first start or join a transaction and then
1151      * lock the qgroup_ioctl_lock mutex.
1152      * We are safe from a concurrent task trying to enable quotas, by calling
1153      * this function, since we are serialized by fs_info->subvol_sem.
1154      */
1155     ret = btrfs_commit_transaction(trans);
1156     trans = NULL;
1157     mutex_lock(&fs_info->qgroup_ioctl_lock);
1158     if (ret)
1159         goto out_free_path;
1160
1161     /*
1162      * Set quota enabled flag after committing the transaction, to avoid
1163      * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
1164      * creation.
1165      */
1166     spin_lock(&fs_info->qgroup_lock);
1167     fs_info->quota_root = quota_root;
1168     set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1169     spin_unlock(&fs_info->qgroup_lock);
1170
1171     ret = qgroup_rescan_init(fs_info, 0, 1);
1172     if (!ret) {
1173             qgroup_rescan_zero_tracking(fs_info);
1174         fs_info->qgroup_rescan_running = true;
1175             btrfs_queue_work(fs_info->qgroup_rescan_workers,
1176                              &fs_info->qgroup_rescan_work);
1177     }
1178
1179 out_free_path:
1180     btrfs_free_path(path);
1181 out_free_root:
1182     if (ret)
1183         btrfs_put_root(quota_root);
1184 out:
1185     if (ret) {
1186         ulist_free(fs_info->qgroup_ulist);
1187         fs_info->qgroup_ulist = NULL;
1188         btrfs_sysfs_del_qgroups(fs_info);
1189     }
1190     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1191     if (ret && trans)
1192         btrfs_end_transaction(trans);
1193     else if (trans)
1194         ret = btrfs_end_transaction(trans);
1195     ulist_free(ulist);
1196     return ret;
1197 }
1198
1199 int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
1200 {
1201     struct btrfs_root *quota_root;
1202     struct btrfs_trans_handle *trans = NULL;
1203     int ret = 0;
1204
1205     /*
1206      * We need to have subvol_sem write locked, to prevent races between
1207      * concurrent tasks trying to disable quotas, because we will unlock
1208      * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
1209      */
1210     lockdep_assert_held_write(&fs_info->subvol_sem);
1211
1212     mutex_lock(&fs_info->qgroup_ioctl_lock);
1213     if (!fs_info->quota_root)
1214         goto out;
1215
1216     /*
1217      * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
1218      * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
1219      * to lock that mutex while holding a transaction handle and the rescan
1220      * worker needs to commit a transaction.
1221      */
1222     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1223
1224     /*
1225      * Request qgroup rescan worker to complete and wait for it. This wait
1226      * must be done before transaction start for quota disable since it may
1227      * deadlock with transaction by the qgroup rescan worker.
1228      */
1229     clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1230     btrfs_qgroup_wait_for_completion(fs_info, false);
1231
1232     /*
1233      * 1 For the root item
1234      *
1235      * We should also reserve enough items for the quota tree deletion in
1236      * btrfs_clean_quota_tree but this is not done.
1237      *
1238      * Also, we must always start a transaction without holding the mutex
1239      * qgroup_ioctl_lock, see btrfs_quota_enable().
1240      */
1241     trans = btrfs_start_transaction(fs_info->tree_root, 1);
1242
1243     mutex_lock(&fs_info->qgroup_ioctl_lock);
1244     if (IS_ERR(trans)) {
1245         ret = PTR_ERR(trans);
1246         trans = NULL;
1247         set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1248         goto out;
1249     }
1250
1251     if (!fs_info->quota_root)
1252         goto out;
1253
1254     spin_lock(&fs_info->qgroup_lock);
1255     quota_root = fs_info->quota_root;
1256     fs_info->quota_root = NULL;
1257     fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1258     spin_unlock(&fs_info->qgroup_lock);
1259
1260     btrfs_free_qgroup_config(fs_info);
1261
1262     ret = btrfs_clean_quota_tree(trans, quota_root);
1263     if (ret) {
1264         btrfs_abort_transaction(trans, ret);
1265         goto out;
1266     }
1267
1268     ret = btrfs_del_root(trans, &quota_root->root_key);
1269     if (ret) {
1270         btrfs_abort_transaction(trans, ret);
1271         goto out;
1272     }
1273
1274     list_del(&quota_root->dirty_list);
1275
1276     btrfs_tree_lock(quota_root->node);
1277     btrfs_clean_tree_block(quota_root->node);
1278     btrfs_tree_unlock(quota_root->node);
1279     btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
1280                   quota_root->node, 0, 1);
1281
1282     btrfs_put_root(quota_root);
1283
1284 out:
1285     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1286     if (ret && trans)
1287         btrfs_end_transaction(trans);
1288     else if (trans)
1289         ret = btrfs_end_transaction(trans);
1290
1291     return ret;
1292 }
1293
1294 static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1295              struct btrfs_qgroup *qgroup)
1296 {
1297     if (list_empty(&qgroup->dirty))
1298         list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1299 }
1300
1301 /*
1302  * The easy accounting, we're updating qgroup relationship whose child qgroup
1303  * only has exclusive extents.
1304  *
1305  * In this case, all exclusive extents will also be exclusive for parent, so
1306  * excl/rfer just get added/removed.
1307  *
1308  * So is qgroup reservation space, which should also be added/removed to
1309  * parent.
1310  * Or when child tries to release reservation space, parent will underflow its
1311  * reservation (for relationship adding case).
1312  *
1313  * Caller should hold fs_info->qgroup_lock.
1314  */
1315 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1316                     struct ulist *tmp, u64 ref_root,
1317                     struct btrfs_qgroup *src, int sign)
1318 {
1319     struct btrfs_qgroup *qgroup;
1320     struct btrfs_qgroup_list *glist;
1321     struct ulist_node *unode;
1322     struct ulist_iterator uiter;
1323     u64 num_bytes = src->excl;
1324     int ret = 0;
1325
1326     qgroup = find_qgroup_rb(fs_info, ref_root);
1327     if (!qgroup)
1328         goto out;
1329
1330     qgroup->rfer += sign * num_bytes;
1331     qgroup->rfer_cmpr += sign * num_bytes;
1332
1333     WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1334     qgroup->excl += sign * num_bytes;
1335     qgroup->excl_cmpr += sign * num_bytes;
1336
1337     if (sign > 0)
1338         qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1339     else
1340         qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1341
1342     qgroup_dirty(fs_info, qgroup);
1343
1344     /* Get all of the parent groups that contain this qgroup */
1345     list_for_each_entry(glist, &qgroup->groups, next_group) {
1346         ret = ulist_add(tmp, glist->group->qgroupid,
1347                 qgroup_to_aux(glist->group), GFP_ATOMIC);
1348         if (ret < 0)
1349             goto out;
1350     }
1351
1352     /* Iterate all of the parents and adjust their reference counts */
1353     ULIST_ITER_INIT(&uiter);
1354     while ((unode = ulist_next(tmp, &uiter))) {
1355         qgroup = unode_aux_to_qgroup(unode);
1356         qgroup->rfer += sign * num_bytes;
1357         qgroup->rfer_cmpr += sign * num_bytes;
1358         WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1359         qgroup->excl += sign * num_bytes;
1360         if (sign > 0)
1361             qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1362         else
1363             qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1364         qgroup->excl_cmpr += sign * num_bytes;
1365         qgroup_dirty(fs_info, qgroup);
1366
1367         /* Add any parents of the parents */
1368         list_for_each_entry(glist, &qgroup->groups, next_group) {
1369             ret = ulist_add(tmp, glist->group->qgroupid,
1370                     qgroup_to_aux(glist->group), GFP_ATOMIC);
1371             if (ret < 0)
1372                 goto out;
1373         }
1374     }
1375     ret = 0;
1376 out:
1377     return ret;
1378 }
1379
1380
1381 /*
1382  * Quick path for updating qgroup with only excl refs.
1383  *
1384  * In that case, just update all parent will be enough.
1385  * Or we needs to do a full rescan.
1386  * Caller should also hold fs_info->qgroup_lock.
1387  *
1388  * Return 0 for quick update, return >0 for need to full rescan
1389  * and mark INCONSISTENT flag.
1390  * Return < 0 for other error.
1391  */
1392 static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1393                    struct ulist *tmp, u64 src, u64 dst,
1394                    int sign)
1395 {
1396     struct btrfs_qgroup *qgroup;
1397     int ret = 1;
1398     int err = 0;
1399
1400     qgroup = find_qgroup_rb(fs_info, src);
1401     if (!qgroup)
1402         goto out;
1403     if (qgroup->excl == qgroup->rfer) {
1404         ret = 0;
1405         err = __qgroup_excl_accounting(fs_info, tmp, dst,
1406                            qgroup, sign);
1407         if (err < 0) {
1408             ret = err;
1409             goto out;
1410         }
1411     }
1412 out:
1413     if (ret)
1414         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1415     return ret;
1416 }
1417
1418 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1419                   u64 dst)
1420 {
1421     struct btrfs_fs_info *fs_info = trans->fs_info;
1422     struct btrfs_qgroup *parent;
1423     struct btrfs_qgroup *member;
1424     struct btrfs_qgroup_list *list;
1425     struct ulist *tmp;
1426     unsigned int nofs_flag;
1427     int ret = 0;
1428
1429     /* Check the level of src and dst first */
1430     if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1431         return -EINVAL;
1432
1433     /* We hold a transaction handle open, must do a NOFS allocation. */
1434     nofs_flag = memalloc_nofs_save();
1435     tmp = ulist_alloc(GFP_KERNEL);
1436     memalloc_nofs_restore(nofs_flag);
1437     if (!tmp)
1438         return -ENOMEM;
1439
1440     mutex_lock(&fs_info->qgroup_ioctl_lock);
1441     if (!fs_info->quota_root) {
1442         ret = -ENOTCONN;
1443         goto out;
1444     }
1445     member = find_qgroup_rb(fs_info, src);
1446     parent = find_qgroup_rb(fs_info, dst);
1447     if (!member || !parent) {
1448         ret = -EINVAL;
1449         goto out;
1450     }
1451
1452     /* check if such qgroup relation exist firstly */
1453     list_for_each_entry(list, &member->groups, next_group) {
1454         if (list->group == parent) {
1455             ret = -EEXIST;
1456             goto out;
1457         }
1458     }
1459
1460     ret = add_qgroup_relation_item(trans, src, dst);
1461     if (ret)
1462         goto out;
1463
1464     ret = add_qgroup_relation_item(trans, dst, src);
1465     if (ret) {
1466         del_qgroup_relation_item(trans, src, dst);
1467         goto out;
1468     }
1469
1470     spin_lock(&fs_info->qgroup_lock);
1471     ret = __add_relation_rb(member, parent);
1472     if (ret < 0) {
1473         spin_unlock(&fs_info->qgroup_lock);
1474         goto out;
1475     }
1476     ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1477     spin_unlock(&fs_info->qgroup_lock);
1478 out:
1479     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1480     ulist_free(tmp);
1481     return ret;
1482 }
1483
1484 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1485                  u64 dst)
1486 {
1487     struct btrfs_fs_info *fs_info = trans->fs_info;
1488     struct btrfs_qgroup *parent;
1489     struct btrfs_qgroup *member;
1490     struct btrfs_qgroup_list *list;
1491     struct ulist *tmp;
1492     bool found = false;
1493     unsigned int nofs_flag;
1494     int ret = 0;
1495     int ret2;
1496
1497     /* We hold a transaction handle open, must do a NOFS allocation. */
1498     nofs_flag = memalloc_nofs_save();
1499     tmp = ulist_alloc(GFP_KERNEL);
1500     memalloc_nofs_restore(nofs_flag);
1501     if (!tmp)
1502         return -ENOMEM;
1503
1504     if (!fs_info->quota_root) {
1505         ret = -ENOTCONN;
1506         goto out;
1507     }
1508
1509     member = find_qgroup_rb(fs_info, src);
1510     parent = find_qgroup_rb(fs_info, dst);
1511     /*
1512      * The parent/member pair doesn't exist, then try to delete the dead
1513      * relation items only.
1514      */
1515     if (!member || !parent)
1516         goto delete_item;
1517
1518     /* check if such qgroup relation exist firstly */
1519     list_for_each_entry(list, &member->groups, next_group) {
1520         if (list->group == parent) {
1521             found = true;
1522             break;
1523         }
1524     }
1525
1526 delete_item:
1527     ret = del_qgroup_relation_item(trans, src, dst);
1528     if (ret < 0 && ret != -ENOENT)
1529         goto out;
1530     ret2 = del_qgroup_relation_item(trans, dst, src);
1531     if (ret2 < 0 && ret2 != -ENOENT)
1532         goto out;
1533
1534     /* At least one deletion succeeded, return 0 */
1535     if (!ret || !ret2)
1536         ret = 0;
1537
1538     if (found) {
1539         spin_lock(&fs_info->qgroup_lock);
1540         del_relation_rb(fs_info, src, dst);
1541         ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1542         spin_unlock(&fs_info->qgroup_lock);
1543     }
1544 out:
1545     ulist_free(tmp);
1546     return ret;
1547 }
1548
1549 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1550                   u64 dst)
1551 {
1552     struct btrfs_fs_info *fs_info = trans->fs_info;
1553     int ret = 0;
1554
1555     mutex_lock(&fs_info->qgroup_ioctl_lock);
1556     ret = __del_qgroup_relation(trans, src, dst);
1557     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1558
1559     return ret;
1560 }
1561
1562 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1563 {
1564     struct btrfs_fs_info *fs_info = trans->fs_info;
1565     struct btrfs_root *quota_root;
1566     struct btrfs_qgroup *qgroup;
1567     int ret = 0;
1568
1569     mutex_lock(&fs_info->qgroup_ioctl_lock);
1570     if (!fs_info->quota_root) {
1571         ret = -ENOTCONN;
1572         goto out;
1573     }
1574     quota_root = fs_info->quota_root;
1575     qgroup = find_qgroup_rb(fs_info, qgroupid);
1576     if (qgroup) {
1577         ret = -EEXIST;
1578         goto out;
1579     }
1580
1581     ret = add_qgroup_item(trans, quota_root, qgroupid);
1582     if (ret)
1583         goto out;
1584
1585     spin_lock(&fs_info->qgroup_lock);
1586     qgroup = add_qgroup_rb(fs_info, qgroupid);
1587     spin_unlock(&fs_info->qgroup_lock);
1588
1589     if (IS_ERR(qgroup)) {
1590         ret = PTR_ERR(qgroup);
1591         goto out;
1592     }
1593     ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1594 out:
1595     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1596     return ret;
1597 }
1598
1599 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1600 {
1601     struct btrfs_fs_info *fs_info = trans->fs_info;
1602     struct btrfs_qgroup *qgroup;
1603     struct btrfs_qgroup_list *list;
1604     int ret = 0;
1605
1606     mutex_lock(&fs_info->qgroup_ioctl_lock);
1607     if (!fs_info->quota_root) {
1608         ret = -ENOTCONN;
1609         goto out;
1610     }
1611
1612     qgroup = find_qgroup_rb(fs_info, qgroupid);
1613     if (!qgroup) {
1614         ret = -ENOENT;
1615         goto out;
1616     }
1617
1618     /* Check if there are no children of this qgroup */
1619     if (!list_empty(&qgroup->members)) {
1620         ret = -EBUSY;
1621         goto out;
1622     }
1623
1624     ret = del_qgroup_item(trans, qgroupid);
1625     if (ret && ret != -ENOENT)
1626         goto out;
1627
1628     while (!list_empty(&qgroup->groups)) {
1629         list = list_first_entry(&qgroup->groups,
1630                     struct btrfs_qgroup_list, next_group);
1631         ret = __del_qgroup_relation(trans, qgroupid,
1632                         list->group->qgroupid);
1633         if (ret)
1634             goto out;
1635     }
1636
1637     spin_lock(&fs_info->qgroup_lock);
1638     del_qgroup_rb(fs_info, qgroupid);
1639     spin_unlock(&fs_info->qgroup_lock);
1640
1641     /*
1642      * Remove the qgroup from sysfs now without holding the qgroup_lock
1643      * spinlock, since the sysfs_remove_group() function needs to take
1644      * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1645      */
1646     btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1647     kfree(qgroup);
1648 out:
1649     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1650     return ret;
1651 }
1652
1653 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1654                struct btrfs_qgroup_limit *limit)
1655 {
1656     struct btrfs_fs_info *fs_info = trans->fs_info;
1657     struct btrfs_qgroup *qgroup;
1658     int ret = 0;
1659     /* Sometimes we would want to clear the limit on this qgroup.
1660      * To meet this requirement, we treat the -1 as a special value
1661      * which tell kernel to clear the limit on this qgroup.
1662      */
1663     const u64 CLEAR_VALUE = -1;
1664
1665     mutex_lock(&fs_info->qgroup_ioctl_lock);
1666     if (!fs_info->quota_root) {
1667         ret = -ENOTCONN;
1668         goto out;
1669     }
1670
1671     qgroup = find_qgroup_rb(fs_info, qgroupid);
1672     if (!qgroup) {
1673         ret = -ENOENT;
1674         goto out;
1675     }
1676
1677     spin_lock(&fs_info->qgroup_lock);
1678     if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1679         if (limit->max_rfer == CLEAR_VALUE) {
1680             qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1681             limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1682             qgroup->max_rfer = 0;
1683         } else {
1684             qgroup->max_rfer = limit->max_rfer;
1685         }
1686     }
1687     if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1688         if (limit->max_excl == CLEAR_VALUE) {
1689             qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1690             limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1691             qgroup->max_excl = 0;
1692         } else {
1693             qgroup->max_excl = limit->max_excl;
1694         }
1695     }
1696     if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1697         if (limit->rsv_rfer == CLEAR_VALUE) {
1698             qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1699             limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1700             qgroup->rsv_rfer = 0;
1701         } else {
1702             qgroup->rsv_rfer = limit->rsv_rfer;
1703         }
1704     }
1705     if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1706         if (limit->rsv_excl == CLEAR_VALUE) {
1707             qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1708             limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1709             qgroup->rsv_excl = 0;
1710         } else {
1711             qgroup->rsv_excl = limit->rsv_excl;
1712         }
1713     }
1714     qgroup->lim_flags |= limit->flags;
1715
1716     spin_unlock(&fs_info->qgroup_lock);
1717
1718     ret = update_qgroup_limit_item(trans, qgroup);
1719     if (ret) {
1720         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1721         btrfs_info(fs_info, "unable to update quota limit for %llu",
1722                qgroupid);
1723     }
1724
1725 out:
1726     mutex_unlock(&fs_info->qgroup_ioctl_lock);
1727     return ret;
1728 }
1729
1730 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1731                 struct btrfs_delayed_ref_root *delayed_refs,
1732                 struct btrfs_qgroup_extent_record *record)
1733 {
1734     struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1735     struct rb_node *parent_node = NULL;
1736     struct btrfs_qgroup_extent_record *entry;
1737     u64 bytenr = record->bytenr;
1738
1739     lockdep_assert_held(&delayed_refs->lock);
1740     trace_btrfs_qgroup_trace_extent(fs_info, record);
1741
1742     while (*p) {
1743         parent_node = *p;
1744         entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1745                  node);
1746         if (bytenr < entry->bytenr) {
1747             p = &(*p)->rb_left;
1748         } else if (bytenr > entry->bytenr) {
1749             p = &(*p)->rb_right;
1750         } else {
1751             if (record->data_rsv && !entry->data_rsv) {
1752                 entry->data_rsv = record->data_rsv;
1753                 entry->data_rsv_refroot =
1754                     record->data_rsv_refroot;
1755             }
1756             return 1;
1757         }
1758     }
1759
1760     rb_link_node(&record->node, parent_node, p);
1761     rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1762     return 0;
1763 }
1764
1765 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
1766                    struct btrfs_qgroup_extent_record *qrecord)
1767 {
1768     struct ulist *old_root;
1769     u64 bytenr = qrecord->bytenr;
1770     int ret;
1771
1772     /*
1773      * We are always called in a context where we are already holding a
1774      * transaction handle. Often we are called when adding a data delayed
1775      * reference from btrfs_truncate_inode_items() (truncating or unlinking),
1776      * in which case we will be holding a write lock on extent buffer from a
1777      * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1778      * acquire fs_info->commit_root_sem, because that is a higher level lock
1779      * that must be acquired before locking any extent buffers.
1780      *
1781      * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1782      * but we can't pass it a non-NULL transaction handle, because otherwise
1783      * it would not use commit roots and would lock extent buffers, causing
1784      * a deadlock if it ends up trying to read lock the same extent buffer
1785      * that was previously write locked at btrfs_truncate_inode_items().
1786      *
1787      * So pass a NULL transaction handle to btrfs_find_all_roots() and
1788      * explicitly tell it to not acquire the commit_root_sem - if we are
1789      * holding a transaction handle we don't need its protection.
1790      */
1791     ASSERT(trans != NULL);
1792
1793     ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
1794                    true);
1795     if (ret < 0) {
1796         trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1797         btrfs_warn(trans->fs_info,
1798 "error accounting new delayed refs extent (err code: %d), quota inconsistent",
1799             ret);
1800         return 0;
1801     }
1802
1803     /*
1804      * Here we don't need to get the lock of
1805      * trans->transaction->delayed_refs, since inserted qrecord won't
1806      * be deleted, only qrecord->node may be modified (new qrecord insert)
1807      *
1808      * So modifying qrecord->old_roots is safe here
1809      */
1810     qrecord->old_roots = old_root;
1811     return 0;
1812 }
1813
1814 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1815                   u64 num_bytes, gfp_t gfp_flag)
1816 {
1817     struct btrfs_fs_info *fs_info = trans->fs_info;
1818     struct btrfs_qgroup_extent_record *record;
1819     struct btrfs_delayed_ref_root *delayed_refs;
1820     int ret;
1821
1822     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1823         || bytenr == 0 || num_bytes == 0)
1824         return 0;
1825     record = kzalloc(sizeof(*record), gfp_flag);
1826     if (!record)
1827         return -ENOMEM;
1828
1829     delayed_refs = &trans->transaction->delayed_refs;
1830     record->bytenr = bytenr;
1831     record->num_bytes = num_bytes;
1832     record->old_roots = NULL;
1833
1834     spin_lock(&delayed_refs->lock);
1835     ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1836     spin_unlock(&delayed_refs->lock);
1837     if (ret > 0) {
1838         kfree(record);
1839         return 0;
1840     }
1841     return btrfs_qgroup_trace_extent_post(trans, record);
1842 }
1843
1844 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1845                   struct extent_buffer *eb)
1846 {
1847     struct btrfs_fs_info *fs_info = trans->fs_info;
1848     int nr = btrfs_header_nritems(eb);
1849     int i, extent_type, ret;
1850     struct btrfs_key key;
1851     struct btrfs_file_extent_item *fi;
1852     u64 bytenr, num_bytes;
1853
1854     /* We can be called directly from walk_up_proc() */
1855     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1856         return 0;
1857
1858     for (i = 0; i < nr; i++) {
1859         btrfs_item_key_to_cpu(eb, &key, i);
1860
1861         if (key.type != BTRFS_EXTENT_DATA_KEY)
1862             continue;
1863
1864         fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1865         /* filter out non qgroup-accountable extents  */
1866         extent_type = btrfs_file_extent_type(eb, fi);
1867
1868         if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1869             continue;
1870
1871         bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1872         if (!bytenr)
1873             continue;
1874
1875         num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1876
1877         ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1878                         GFP_NOFS);
1879         if (ret)
1880             return ret;
1881     }
1882     cond_resched();
1883     return 0;
1884 }
1885
1886 /*
1887  * Walk up the tree from the bottom, freeing leaves and any interior
1888  * nodes which have had all slots visited. If a node (leaf or
1889  * interior) is freed, the node above it will have it's slot
1890  * incremented. The root node will never be freed.
1891  *
1892  * At the end of this function, we should have a path which has all
1893  * slots incremented to the next position for a search. If we need to
1894  * read a new node it will be NULL and the node above it will have the
1895  * correct slot selected for a later read.
1896  *
1897  * If we increment the root nodes slot counter past the number of
1898  * elements, 1 is returned to signal completion of the search.
1899  */
1900 static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1901 {
1902     int level = 0;
1903     int nr, slot;
1904     struct extent_buffer *eb;
1905
1906     if (root_level == 0)
1907         return 1;
1908
1909     while (level <= root_level) {
1910         eb = path->nodes[level];
1911         nr = btrfs_header_nritems(eb);
1912         path->slots[level]++;
1913         slot = path->slots[level];
1914         if (slot >= nr || level == 0) {
1915             /*
1916              * Don't free the root -  we will detect this
1917              * condition after our loop and return a
1918              * positive value for caller to stop walking the tree.
1919              */
1920             if (level != root_level) {
1921                 btrfs_tree_unlock_rw(eb, path->locks[level]);
1922                 path->locks[level] = 0;
1923
1924                 free_extent_buffer(eb);
1925                 path->nodes[level] = NULL;
1926                 path->slots[level] = 0;
1927             }
1928         } else {
1929             /*
1930              * We have a valid slot to walk back down
1931              * from. Stop here so caller can process these
1932              * new nodes.
1933              */
1934             break;
1935         }
1936
1937         level++;
1938     }
1939
1940     eb = path->nodes[root_level];
1941     if (path->slots[root_level] >= btrfs_header_nritems(eb))
1942         return 1;
1943
1944     return 0;
1945 }
1946
1947 /*
1948  * Helper function to trace a subtree tree block swap.
1949  *
1950  * The swap will happen in highest tree block, but there may be a lot of
1951  * tree blocks involved.
1952  *
1953  * For example:
1954  *  OO = Old tree blocks
1955  *  NN = New tree blocks allocated during balance
1956  *
1957  *           File tree (257)                  Reloc tree for 257
1958  * L2              OO                                NN
1959  *               /    \                            /    \
1960  * L1          OO      OO (a)                    OO      NN (a)
1961  *            / \     / \                       / \     / \
1962  * L0       OO   OO OO   OO                   OO   OO NN   NN
1963  *                  (b)  (c)                          (b)  (c)
1964  *
1965  * When calling qgroup_trace_extent_swap(), we will pass:
1966  * @src_eb = OO(a)
1967  * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1968  * @dst_level = 0
1969  * @root_level = 1
1970  *
1971  * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1972  * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1973  *
1974  * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1975  *
1976  * 1) Tree search from @src_eb
1977  *    It should act as a simplified btrfs_search_slot().
1978  *    The key for search can be extracted from @dst_path->nodes[dst_level]
1979  *    (first key).
1980  *
1981  * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1982  *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
1983  *    They should be marked during previous (@dst_level = 1) iteration.
1984  *
1985  * 3) Mark file extents in leaves dirty
1986  *    We don't have a good way to pick out new file extents only.
1987  *    So we still follow the old method by scanning all file extents in
1988  *    the leaf.
1989  *
1990  * This function can free us from keeping two paths, thus later we only need
1991  * to care about how to iterate all new tree blocks in reloc tree.
1992  */
1993 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1994                     struct extent_buffer *src_eb,
1995                     struct btrfs_path *dst_path,
1996                     int dst_level, int root_level,
1997                     bool trace_leaf)
1998 {
1999     struct btrfs_key key;
2000     struct btrfs_path *src_path;
2001     struct btrfs_fs_info *fs_info = trans->fs_info;
2002     u32 nodesize = fs_info->nodesize;
2003     int cur_level = root_level;
2004     int ret;
2005
2006     BUG_ON(dst_level > root_level);
2007     /* Level mismatch */
2008     if (btrfs_header_level(src_eb) != root_level)
2009         return -EINVAL;
2010
2011     src_path = btrfs_alloc_path();
2012     if (!src_path) {
2013         ret = -ENOMEM;
2014         goto out;
2015     }
2016
2017     if (dst_level)
2018         btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2019     else
2020         btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2021
2022     /* For src_path */
2023     atomic_inc(&src_eb->refs);
2024     src_path->nodes[root_level] = src_eb;
2025     src_path->slots[root_level] = dst_path->slots[root_level];
2026     src_path->locks[root_level] = 0;
2027
2028     /* A simplified version of btrfs_search_slot() */
2029     while (cur_level >= dst_level) {
2030         struct btrfs_key src_key;
2031         struct btrfs_key dst_key;
2032
2033         if (src_path->nodes[cur_level] == NULL) {
2034             struct extent_buffer *eb;
2035             int parent_slot;
2036
2037             eb = src_path->nodes[cur_level + 1];
2038             parent_slot = src_path->slots[cur_level + 1];
2039
2040             eb = btrfs_read_node_slot(eb, parent_slot);
2041             if (IS_ERR(eb)) {
2042                 ret = PTR_ERR(eb);
2043                 goto out;
2044             }
2045
2046             src_path->nodes[cur_level] = eb;
2047
2048             btrfs_tree_read_lock(eb);
2049             src_path->locks[cur_level] = BTRFS_READ_LOCK;
2050         }
2051
2052         src_path->slots[cur_level] = dst_path->slots[cur_level];
2053         if (cur_level) {
2054             btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2055                     &dst_key, dst_path->slots[cur_level]);
2056             btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2057                     &src_key, src_path->slots[cur_level]);
2058         } else {
2059             btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2060                     &dst_key, dst_path->slots[cur_level]);
2061             btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2062                     &src_key, src_path->slots[cur_level]);
2063         }
2064         /* Content mismatch, something went wrong */
2065         if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2066             ret = -ENOENT;
2067             goto out;
2068         }
2069         cur_level--;
2070     }
2071
2072     /*
2073      * Now both @dst_path and @src_path have been populated, record the tree
2074      * blocks for qgroup accounting.
2075      */
2076     ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2077             nodesize, GFP_NOFS);
2078     if (ret < 0)
2079         goto out;
2080     ret = btrfs_qgroup_trace_extent(trans,
2081             dst_path->nodes[dst_level]->start,
2082             nodesize, GFP_NOFS);
2083     if (ret < 0)
2084         goto out;
2085
2086     /* Record leaf file extents */
2087     if (dst_level == 0 && trace_leaf) {
2088         ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2089         if (ret < 0)
2090             goto out;
2091         ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2092     }
2093 out:
2094     btrfs_free_path(src_path);
2095     return ret;
2096 }
2097
2098 /*
2099  * Helper function to do recursive generation-aware depth-first search, to
2100  * locate all new tree blocks in a subtree of reloc tree.
2101  *
2102  * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2103  *         reloc tree
2104  * L2         NN (a)
2105  *          /    \
2106  * L1    OO        NN (b)
2107  *      /  \      /  \
2108  * L0  OO  OO    OO  NN
2109  *               (c) (d)
2110  * If we pass:
2111  * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2112  * @cur_level = 1
2113  * @root_level = 1
2114  *
2115  * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
2116  * above tree blocks along with their counterparts in the file tree.
2117  * While during search, old tree blocks OO(c) will be skipped as tree block swap
2118  * won't affect OO(c).
2119  */
2120 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
2121                        struct extent_buffer *src_eb,
2122                        struct btrfs_path *dst_path,
2123                        int cur_level, int root_level,
2124                        u64 last_snapshot, bool trace_leaf)
2125 {
2126     struct btrfs_fs_info *fs_info = trans->fs_info;
2127     struct extent_buffer *eb;
2128     bool need_cleanup = false;
2129     int ret = 0;
2130     int i;
2131
2132     /* Level sanity check */
2133     if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2134         root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2135         root_level < cur_level) {
2136         btrfs_err_rl(fs_info,
2137             "%s: bad levels, cur_level=%d root_level=%d",
2138             __func__, cur_level, root_level);
2139         return -EUCLEAN;
2140     }
2141
2142     /* Read the tree block if needed */
2143     if (dst_path->nodes[cur_level] == NULL) {
2144         int parent_slot;
2145         u64 child_gen;
2146
2147         /*
2148          * dst_path->nodes[root_level] must be initialized before
2149          * calling this function.
2150          */
2151         if (cur_level == root_level) {
2152             btrfs_err_rl(fs_info,
2153     "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2154                 __func__, root_level, root_level, cur_level);
2155             return -EUCLEAN;
2156         }
2157
2158         /*
2159          * We need to get child blockptr/gen from parent before we can
2160          * read it.
2161           */
2162         eb = dst_path->nodes[cur_level + 1];
2163         parent_slot = dst_path->slots[cur_level + 1];
2164         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2165
2166         /* This node is old, no need to trace */
2167         if (child_gen < last_snapshot)
2168             goto out;
2169
2170         eb = btrfs_read_node_slot(eb, parent_slot);
2171         if (IS_ERR(eb)) {
2172             ret = PTR_ERR(eb);
2173             goto out;
2174         }
2175
2176         dst_path->nodes[cur_level] = eb;
2177         dst_path->slots[cur_level] = 0;
2178
2179         btrfs_tree_read_lock(eb);
2180         dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2181         need_cleanup = true;
2182     }
2183
2184     /* Now record this tree block and its counter part for qgroups */
2185     ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2186                        root_level, trace_leaf);
2187     if (ret < 0)
2188         goto cleanup;
2189
2190     eb = dst_path->nodes[cur_level];
2191
2192     if (cur_level > 0) {
2193         /* Iterate all child tree blocks */
2194         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2195             /* Skip old tree blocks as they won't be swapped */
2196             if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2197                 continue;
2198             dst_path->slots[cur_level] = i;
2199
2200             /* Recursive call (at most 7 times) */
2201             ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2202                     dst_path, cur_level - 1, root_level,
2203                     last_snapshot, trace_leaf);
2204             if (ret < 0)
2205                 goto cleanup;
2206         }
2207     }
2208
2209 cleanup:
2210     if (need_cleanup) {
2211         /* Clean up */
2212         btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2213                      dst_path->locks[cur_level]);
2214         free_extent_buffer(dst_path->nodes[cur_level]);
2215         dst_path->nodes[cur_level] = NULL;
2216         dst_path->slots[cur_level] = 0;
2217         dst_path->locks[cur_level] = 0;
2218     }
2219 out:
2220     return ret;
2221 }
2222
2223 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2224                 struct extent_buffer *src_eb,
2225                 struct extent_buffer *dst_eb,
2226                 u64 last_snapshot, bool trace_leaf)
2227 {
2228     struct btrfs_fs_info *fs_info = trans->fs_info;
2229     struct btrfs_path *dst_path = NULL;
2230     int level;
2231     int ret;
2232
2233     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2234         return 0;
2235
2236     /* Wrong parameter order */
2237     if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2238         btrfs_err_rl(fs_info,
2239         "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2240                  btrfs_header_generation(src_eb),
2241                  btrfs_header_generation(dst_eb));
2242         return -EUCLEAN;
2243     }
2244
2245     if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2246         ret = -EIO;
2247         goto out;
2248     }
2249
2250     level = btrfs_header_level(dst_eb);
2251     dst_path = btrfs_alloc_path();
2252     if (!dst_path) {
2253         ret = -ENOMEM;
2254         goto out;
2255     }
2256     /* For dst_path */
2257     atomic_inc(&dst_eb->refs);
2258     dst_path->nodes[level] = dst_eb;
2259     dst_path->slots[level] = 0;
2260     dst_path->locks[level] = 0;
2261
2262     /* Do the generation aware breadth-first search */
2263     ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2264                           level, last_snapshot, trace_leaf);
2265     if (ret < 0)
2266         goto out;
2267     ret = 0;
2268
2269 out:
2270     btrfs_free_path(dst_path);
2271     if (ret < 0)
2272         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2273     return ret;
2274 }
2275
2276 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2277                    struct extent_buffer *root_eb,
2278                    u64 root_gen, int root_level)
2279 {
2280     struct btrfs_fs_info *fs_info = trans->fs_info;
2281     int ret = 0;
2282     int level;
2283     struct extent_buffer *eb = root_eb;
2284     struct btrfs_path *path = NULL;
2285
2286     BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2287     BUG_ON(root_eb == NULL);
2288
2289     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2290         return 0;
2291
2292     if (!extent_buffer_uptodate(root_eb)) {
2293         ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
2294         if (ret)
2295             goto out;
2296     }
2297
2298     if (root_level == 0) {
2299         ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2300         goto out;
2301     }
2302
2303     path = btrfs_alloc_path();
2304     if (!path)
2305         return -ENOMEM;
2306
2307     /*
2308      * Walk down the tree.  Missing extent blocks are filled in as
2309      * we go. Metadata is accounted every time we read a new
2310      * extent block.
2311      *
2312      * When we reach a leaf, we account for file extent items in it,
2313      * walk back up the tree (adjusting slot pointers as we go)
2314      * and restart the search process.
2315      */
2316     atomic_inc(&root_eb->refs); /* For path */
2317     path->nodes[root_level] = root_eb;
2318     path->slots[root_level] = 0;
2319     path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2320 walk_down:
2321     level = root_level;
2322     while (level >= 0) {
2323         if (path->nodes[level] == NULL) {
2324             int parent_slot;
2325             u64 child_bytenr;
2326
2327             /*
2328              * We need to get child blockptr from parent before we
2329              * can read it.
2330               */
2331             eb = path->nodes[level + 1];
2332             parent_slot = path->slots[level + 1];
2333             child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2334
2335             eb = btrfs_read_node_slot(eb, parent_slot);
2336             if (IS_ERR(eb)) {
2337                 ret = PTR_ERR(eb);
2338                 goto out;
2339             }
2340
2341             path->nodes[level] = eb;
2342             path->slots[level] = 0;
2343
2344             btrfs_tree_read_lock(eb);
2345             path->locks[level] = BTRFS_READ_LOCK;
2346
2347             ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2348                             fs_info->nodesize,
2349                             GFP_NOFS);
2350             if (ret)
2351                 goto out;
2352         }
2353
2354         if (level == 0) {
2355             ret = btrfs_qgroup_trace_leaf_items(trans,
2356                                 path->nodes[level]);
2357             if (ret)
2358                 goto out;
2359
2360             /* Nonzero return here means we completed our search */
2361             ret = adjust_slots_upwards(path, root_level);
2362             if (ret)
2363                 break;
2364
2365             /* Restart search with new slots */
2366             goto walk_down;
2367         }
2368
2369         level--;
2370     }
2371
2372     ret = 0;
2373 out:
2374     btrfs_free_path(path);
2375
2376     return ret;
2377 }
2378
2379 #define UPDATE_NEW  0
2380 #define UPDATE_OLD  1
2381 /*
2382  * Walk all of the roots that points to the bytenr and adjust their refcnts.
2383  */
2384 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2385                 struct ulist *roots, struct ulist *tmp,
2386                 struct ulist *qgroups, u64 seq, int update_old)
2387 {
2388     struct ulist_node *unode;
2389     struct ulist_iterator uiter;
2390     struct ulist_node *tmp_unode;
2391     struct ulist_iterator tmp_uiter;
2392     struct btrfs_qgroup *qg;
2393     int ret = 0;
2394
2395     if (!roots)
2396         return 0;
2397     ULIST_ITER_INIT(&uiter);
2398     while ((unode = ulist_next(roots, &uiter))) {
2399         qg = find_qgroup_rb(fs_info, unode->val);
2400         if (!qg)
2401             continue;
2402
2403         ulist_reinit(tmp);
2404         ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2405                 GFP_ATOMIC);
2406         if (ret < 0)
2407             return ret;
2408         ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2409         if (ret < 0)
2410             return ret;
2411         ULIST_ITER_INIT(&tmp_uiter);
2412         while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2413             struct btrfs_qgroup_list *glist;
2414
2415             qg = unode_aux_to_qgroup(tmp_unode);
2416             if (update_old)
2417                 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2418             else
2419                 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2420             list_for_each_entry(glist, &qg->groups, next_group) {
2421                 ret = ulist_add(qgroups, glist->group->qgroupid,
2422                         qgroup_to_aux(glist->group),
2423                         GFP_ATOMIC);
2424                 if (ret < 0)
2425                     return ret;
2426                 ret = ulist_add(tmp, glist->group->qgroupid,
2427                         qgroup_to_aux(glist->group),
2428                         GFP_ATOMIC);
2429                 if (ret < 0)
2430                     return ret;
2431             }
2432         }
2433     }
2434     return 0;
2435 }
2436
2437 /*
2438  * Update qgroup rfer/excl counters.
2439  * Rfer update is easy, codes can explain themselves.
2440  *
2441  * Excl update is tricky, the update is split into 2 parts.
2442  * Part 1: Possible exclusive <-> sharing detect:
2443  *  |   A   |   !A  |
2444  *  -------------------------------------
2445  *  B   |   *   |   -   |
2446  *  -------------------------------------
2447  *  !B  |   +   |   **  |
2448  *  -------------------------------------
2449  *
2450  * Conditions:
2451  * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2452  * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2453  * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2454  * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2455  *
2456  * Results:
2457  * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
2458  * *: Definitely not changed.       **: Possible unchanged.
2459  *
2460  * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
2461  *
2462  * To make the logic clear, we first use condition A and B to split
2463  * combination into 4 results.
2464  *
2465  * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
2466  * only one variant may be 0.
2467  *
2468  * Lastly, check result **, since there are 2 variants that may be 0, split
2469  * them again (2x2).
2470  * But this time we don't need to consider other things, the codes and logic
2471  * is easy to understand now.
2472  */
2473 static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2474                   struct ulist *qgroups,
2475                   u64 nr_old_roots,
2476                   u64 nr_new_roots,
2477                   u64 num_bytes, u64 seq)
2478 {
2479     struct ulist_node *unode;
2480     struct ulist_iterator uiter;
2481     struct btrfs_qgroup *qg;
2482     u64 cur_new_count, cur_old_count;
2483
2484     ULIST_ITER_INIT(&uiter);
2485     while ((unode = ulist_next(qgroups, &uiter))) {
2486         bool dirty = false;
2487
2488         qg = unode_aux_to_qgroup(unode);
2489         cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2490         cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2491
2492         trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2493                          cur_new_count);
2494
2495         /* Rfer update part */
2496         if (cur_old_count == 0 && cur_new_count > 0) {
2497             qg->rfer += num_bytes;
2498             qg->rfer_cmpr += num_bytes;
2499             dirty = true;
2500         }
2501         if (cur_old_count > 0 && cur_new_count == 0) {
2502             qg->rfer -= num_bytes;
2503             qg->rfer_cmpr -= num_bytes;
2504             dirty = true;
2505         }
2506
2507         /* Excl update part */
2508         /* Exclusive/none -> shared case */
2509         if (cur_old_count == nr_old_roots &&
2510             cur_new_count < nr_new_roots) {
2511             /* Exclusive -> shared */
2512             if (cur_old_count != 0) {
2513                 qg->excl -= num_bytes;
2514                 qg->excl_cmpr -= num_bytes;
2515                 dirty = true;
2516             }
2517         }
2518
2519         /* Shared -> exclusive/none case */
2520         if (cur_old_count < nr_old_roots &&
2521             cur_new_count == nr_new_roots) {
2522             /* Shared->exclusive */
2523             if (cur_new_count != 0) {
2524                 qg->excl += num_bytes;
2525                 qg->excl_cmpr += num_bytes;
2526                 dirty = true;
2527             }
2528         }
2529
2530         /* Exclusive/none -> exclusive/none case */
2531         if (cur_old_count == nr_old_roots &&
2532             cur_new_count == nr_new_roots) {
2533             if (cur_old_count == 0) {
2534                 /* None -> exclusive/none */
2535
2536                 if (cur_new_count != 0) {
2537                     /* None -> exclusive */
2538                     qg->excl += num_bytes;
2539                     qg->excl_cmpr += num_bytes;
2540                     dirty = true;
2541                 }
2542                 /* None -> none, nothing changed */
2543             } else {
2544                 /* Exclusive -> exclusive/none */
2545
2546                 if (cur_new_count == 0) {
2547                     /* Exclusive -> none */
2548                     qg->excl -= num_bytes;
2549                     qg->excl_cmpr -= num_bytes;
2550                     dirty = true;
2551                 }
2552                 /* Exclusive -> exclusive, nothing changed */
2553             }
2554         }
2555
2556         if (dirty)
2557             qgroup_dirty(fs_info, qg);
2558     }
2559     return 0;
2560 }
2561
2562 /*
2563  * Check if the @roots potentially is a list of fs tree roots
2564  *
2565  * Return 0 for definitely not a fs/subvol tree roots ulist
2566  * Return 1 for possible fs/subvol tree roots in the list (considering an empty
2567  *          one as well)
2568  */
2569 static int maybe_fs_roots(struct ulist *roots)
2570 {
2571     struct ulist_node *unode;
2572     struct ulist_iterator uiter;
2573
2574     /* Empty one, still possible for fs roots */
2575     if (!roots || roots->nnodes == 0)
2576         return 1;
2577
2578     ULIST_ITER_INIT(&uiter);
2579     unode = ulist_next(roots, &uiter);
2580     if (!unode)
2581         return 1;
2582
2583     /*
2584      * If it contains fs tree roots, then it must belong to fs/subvol
2585      * trees.
2586      * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2587      */
2588     return is_fstree(unode->val);
2589 }
2590
2591 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2592                 u64 num_bytes, struct ulist *old_roots,
2593                 struct ulist *new_roots)
2594 {
2595     struct btrfs_fs_info *fs_info = trans->fs_info;
2596     struct ulist *qgroups = NULL;
2597     struct ulist *tmp = NULL;
2598     u64 seq;
2599     u64 nr_new_roots = 0;
2600     u64 nr_old_roots = 0;
2601     int ret = 0;
2602
2603     /*
2604      * If quotas get disabled meanwhile, the resources need to be freed and
2605      * we can't just exit here.
2606      */
2607     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2608         goto out_free;
2609
2610     if (new_roots) {
2611         if (!maybe_fs_roots(new_roots))
2612             goto out_free;
2613         nr_new_roots = new_roots->nnodes;
2614     }
2615     if (old_roots) {
2616         if (!maybe_fs_roots(old_roots))
2617             goto out_free;
2618         nr_old_roots = old_roots->nnodes;
2619     }
2620
2621     /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2622     if (nr_old_roots == 0 && nr_new_roots == 0)
2623         goto out_free;
2624
2625     BUG_ON(!fs_info->quota_root);
2626
2627     trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2628                     num_bytes, nr_old_roots, nr_new_roots);
2629
2630     qgroups = ulist_alloc(GFP_NOFS);
2631     if (!qgroups) {
2632         ret = -ENOMEM;
2633         goto out_free;
2634     }
2635     tmp = ulist_alloc(GFP_NOFS);
2636     if (!tmp) {
2637         ret = -ENOMEM;
2638         goto out_free;
2639     }
2640
2641     mutex_lock(&fs_info->qgroup_rescan_lock);
2642     if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2643         if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2644             mutex_unlock(&fs_info->qgroup_rescan_lock);
2645             ret = 0;
2646             goto out_free;
2647         }
2648     }
2649     mutex_unlock(&fs_info->qgroup_rescan_lock);
2650
2651     spin_lock(&fs_info->qgroup_lock);
2652     seq = fs_info->qgroup_seq;
2653
2654     /* Update old refcnts using old_roots */
2655     ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2656                    UPDATE_OLD);
2657     if (ret < 0)
2658         goto out;
2659
2660     /* Update new refcnts using new_roots */
2661     ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2662                    UPDATE_NEW);
2663     if (ret < 0)
2664         goto out;
2665
2666     qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2667                    num_bytes, seq);
2668
2669     /*
2670      * Bump qgroup_seq to avoid seq overlap
2671      */
2672     fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2673 out:
2674     spin_unlock(&fs_info->qgroup_lock);
2675 out_free:
2676     ulist_free(tmp);
2677     ulist_free(qgroups);
2678     ulist_free(old_roots);
2679     ulist_free(new_roots);
2680     return ret;
2681 }
2682
2683 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2684 {
2685     struct btrfs_fs_info *fs_info = trans->fs_info;
2686     struct btrfs_qgroup_extent_record *record;
2687     struct btrfs_delayed_ref_root *delayed_refs;
2688     struct ulist *new_roots = NULL;
2689     struct rb_node *node;
2690     u64 num_dirty_extents = 0;
2691     u64 qgroup_to_skip;
2692     int ret = 0;
2693
2694     delayed_refs = &trans->transaction->delayed_refs;
2695     qgroup_to_skip = delayed_refs->qgroup_to_skip;
2696     while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2697         record = rb_entry(node, struct btrfs_qgroup_extent_record,
2698                   node);
2699
2700         num_dirty_extents++;
2701         trace_btrfs_qgroup_account_extents(fs_info, record);
2702
2703         if (!ret) {
2704             /*
2705              * Old roots should be searched when inserting qgroup
2706              * extent record
2707              */
2708             if (WARN_ON(!record->old_roots)) {
2709                 /* Search commit root to find old_roots */
2710                 ret = btrfs_find_all_roots(NULL, fs_info,
2711                         record->bytenr, 0,
2712                         &record->old_roots, false);
2713                 if (ret < 0)
2714                     goto cleanup;
2715             }
2716
2717             /* Free the reserved data space */
2718             btrfs_qgroup_free_refroot(fs_info,
2719                     record->data_rsv_refroot,
2720                     record->data_rsv,
2721                     BTRFS_QGROUP_RSV_DATA);
2722             /*
2723              * Use BTRFS_SEQ_LAST as time_seq to do special search,
2724              * which doesn't lock tree or delayed_refs and search
2725              * current root. It's safe inside commit_transaction().
2726              */
2727             ret = btrfs_find_all_roots(trans, fs_info,
2728                record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
2729             if (ret < 0)
2730                 goto cleanup;
2731             if (qgroup_to_skip) {
2732                 ulist_del(new_roots, qgroup_to_skip, 0);
2733                 ulist_del(record->old_roots, qgroup_to_skip,
2734                       0);
2735             }
2736             ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2737                               record->num_bytes,
2738                               record->old_roots,
2739                               new_roots);
2740             record->old_roots = NULL;
2741             new_roots = NULL;
2742         }
2743 cleanup:
2744         ulist_free(record->old_roots);
2745         ulist_free(new_roots);
2746         new_roots = NULL;
2747         rb_erase(node, &delayed_refs->dirty_extent_root);
2748         kfree(record);
2749
2750     }
2751     trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2752                        num_dirty_extents);
2753     return ret;
2754 }
2755
2756 /*
2757  * called from commit_transaction. Writes all changed qgroups to disk.
2758  */
2759 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2760 {
2761     struct btrfs_fs_info *fs_info = trans->fs_info;
2762     int ret = 0;
2763
2764     if (!fs_info->quota_root)
2765         return ret;
2766
2767     spin_lock(&fs_info->qgroup_lock);
2768     while (!list_empty(&fs_info->dirty_qgroups)) {
2769         struct btrfs_qgroup *qgroup;
2770         qgroup = list_first_entry(&fs_info->dirty_qgroups,
2771                       struct btrfs_qgroup, dirty);
2772         list_del_init(&qgroup->dirty);
2773         spin_unlock(&fs_info->qgroup_lock);
2774         ret = update_qgroup_info_item(trans, qgroup);
2775         if (ret)
2776             fs_info->qgroup_flags |=
2777                     BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2778         ret = update_qgroup_limit_item(trans, qgroup);
2779         if (ret)
2780             fs_info->qgroup_flags |=
2781                     BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2782         spin_lock(&fs_info->qgroup_lock);
2783     }
2784     if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2785         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2786     else
2787         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2788     spin_unlock(&fs_info->qgroup_lock);
2789
2790     ret = update_qgroup_status_item(trans);
2791     if (ret)
2792         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2793
2794     return ret;
2795 }
2796
2797 /*
2798  * Copy the accounting information between qgroups. This is necessary
2799  * when a snapshot or a subvolume is created. Throwing an error will
2800  * cause a transaction abort so we take extra care here to only error
2801  * when a readonly fs is a reasonable outcome.
2802  */
2803 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2804              u64 objectid, struct btrfs_qgroup_inherit *inherit)
2805 {
2806     int ret = 0;
2807     int i;
2808     u64 *i_qgroups;
2809     bool committing = false;
2810     struct btrfs_fs_info *fs_info = trans->fs_info;
2811     struct btrfs_root *quota_root;
2812     struct btrfs_qgroup *srcgroup;
2813     struct btrfs_qgroup *dstgroup;
2814     bool need_rescan = false;
2815     u32 level_size = 0;
2816     u64 nums;
2817
2818     /*
2819      * There are only two callers of this function.
2820      *
2821      * One in create_subvol() in the ioctl context, which needs to hold
2822      * the qgroup_ioctl_lock.
2823      *
2824      * The other one in create_pending_snapshot() where no other qgroup
2825      * code can modify the fs as they all need to either start a new trans
2826      * or hold a trans handler, thus we don't need to hold
2827      * qgroup_ioctl_lock.
2828      * This would avoid long and complex lock chain and make lockdep happy.
2829      */
2830     spin_lock(&fs_info->trans_lock);
2831     if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2832         committing = true;
2833     spin_unlock(&fs_info->trans_lock);
2834
2835     if (!committing)
2836         mutex_lock(&fs_info->qgroup_ioctl_lock);
2837     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2838         goto out;
2839
2840     quota_root = fs_info->quota_root;
2841     if (!quota_root) {
2842         ret = -EINVAL;
2843         goto out;
2844     }
2845
2846     if (inherit) {
2847         i_qgroups = (u64 *)(inherit + 1);
2848         nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2849                2 * inherit->num_excl_copies;
2850         for (i = 0; i < nums; ++i) {
2851             srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2852
2853             /*
2854              * Zero out invalid groups so we can ignore
2855              * them later.
2856              */
2857             if (!srcgroup ||
2858                 ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2859                 *i_qgroups = 0ULL;
2860
2861             ++i_qgroups;
2862         }
2863     }
2864
2865     /*
2866      * create a tracking group for the subvol itself
2867      */
2868     ret = add_qgroup_item(trans, quota_root, objectid);
2869     if (ret)
2870         goto out;
2871
2872     /*
2873      * add qgroup to all inherited groups
2874      */
2875     if (inherit) {
2876         i_qgroups = (u64 *)(inherit + 1);
2877         for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2878             if (*i_qgroups == 0)
2879                 continue;
2880             ret = add_qgroup_relation_item(trans, objectid,
2881                                *i_qgroups);
2882             if (ret && ret != -EEXIST)
2883                 goto out;
2884             ret = add_qgroup_relation_item(trans, *i_qgroups,
2885                                objectid);
2886             if (ret && ret != -EEXIST)
2887                 goto out;
2888         }
2889         ret = 0;
2890     }
2891
2892
2893     spin_lock(&fs_info->qgroup_lock);
2894
2895     dstgroup = add_qgroup_rb(fs_info, objectid);
2896     if (IS_ERR(dstgroup)) {
2897         ret = PTR_ERR(dstgroup);
2898         goto unlock;
2899     }
2900
2901     if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2902         dstgroup->lim_flags = inherit->lim.flags;
2903         dstgroup->max_rfer = inherit->lim.max_rfer;
2904         dstgroup->max_excl = inherit->lim.max_excl;
2905         dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2906         dstgroup->rsv_excl = inherit->lim.rsv_excl;
2907
2908         ret = update_qgroup_limit_item(trans, dstgroup);
2909         if (ret) {
2910             fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2911             btrfs_info(fs_info,
2912                    "unable to update quota limit for %llu",
2913                    dstgroup->qgroupid);
2914             goto unlock;
2915         }
2916     }
2917
2918     if (srcid) {
2919         srcgroup = find_qgroup_rb(fs_info, srcid);
2920         if (!srcgroup)
2921             goto unlock;
2922
2923         /*
2924          * We call inherit after we clone the root in order to make sure
2925          * our counts don't go crazy, so at this point the only
2926          * difference between the two roots should be the root node.
2927          */
2928         level_size = fs_info->nodesize;
2929         dstgroup->rfer = srcgroup->rfer;
2930         dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2931         dstgroup->excl = level_size;
2932         dstgroup->excl_cmpr = level_size;
2933         srcgroup->excl = level_size;
2934         srcgroup->excl_cmpr = level_size;
2935
2936         /* inherit the limit info */
2937         dstgroup->lim_flags = srcgroup->lim_flags;
2938         dstgroup->max_rfer = srcgroup->max_rfer;
2939         dstgroup->max_excl = srcgroup->max_excl;
2940         dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2941         dstgroup->rsv_excl = srcgroup->rsv_excl;
2942
2943         qgroup_dirty(fs_info, dstgroup);
2944         qgroup_dirty(fs_info, srcgroup);
2945     }
2946
2947     if (!inherit)
2948         goto unlock;
2949
2950     i_qgroups = (u64 *)(inherit + 1);
2951     for (i = 0; i < inherit->num_qgroups; ++i) {
2952         if (*i_qgroups) {
2953             ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2954             if (ret)
2955                 goto unlock;
2956         }
2957         ++i_qgroups;
2958
2959         /*
2960          * If we're doing a snapshot, and adding the snapshot to a new
2961          * qgroup, the numbers are guaranteed to be incorrect.
2962          */
2963         if (srcid)
2964             need_rescan = true;
2965     }
2966
2967     for (i = 0; i <  inherit->num_ref_copies; ++i, i_qgroups += 2) {
2968         struct btrfs_qgroup *src;
2969         struct btrfs_qgroup *dst;
2970
2971         if (!i_qgroups[0] || !i_qgroups[1])
2972             continue;
2973
2974         src = find_qgroup_rb(fs_info, i_qgroups[0]);
2975         dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2976
2977         if (!src || !dst) {
2978             ret = -EINVAL;
2979             goto unlock;
2980         }
2981
2982         dst->rfer = src->rfer - level_size;
2983         dst->rfer_cmpr = src->rfer_cmpr - level_size;
2984
2985         /* Manually tweaking numbers certainly needs a rescan */
2986         need_rescan = true;
2987     }
2988     for (i = 0; i <  inherit->num_excl_copies; ++i, i_qgroups += 2) {
2989         struct btrfs_qgroup *src;
2990         struct btrfs_qgroup *dst;
2991
2992         if (!i_qgroups[0] || !i_qgroups[1])
2993             continue;
2994
2995         src = find_qgroup_rb(fs_info, i_qgroups[0]);
2996         dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2997
2998         if (!src || !dst) {
2999             ret = -EINVAL;
3000             goto unlock;
3001         }
3002
3003         dst->excl = src->excl + level_size;
3004         dst->excl_cmpr = src->excl_cmpr + level_size;
3005         need_rescan = true;
3006     }
3007
3008 unlock:
3009     spin_unlock(&fs_info->qgroup_lock);
3010     if (!ret)
3011         ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
3012 out:
3013     if (!committing)
3014         mutex_unlock(&fs_info->qgroup_ioctl_lock);
3015     if (need_rescan)
3016         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3017     return ret;
3018 }
3019
3020 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3021 {
3022     if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3023         qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3024         return false;
3025
3026     if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3027         qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3028         return false;
3029
3030     return true;
3031 }
3032
3033 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3034               enum btrfs_qgroup_rsv_type type)
3035 {
3036     struct btrfs_qgroup *qgroup;
3037     struct btrfs_fs_info *fs_info = root->fs_info;
3038     u64 ref_root = root->root_key.objectid;
3039     int ret = 0;
3040     struct ulist_node *unode;
3041     struct ulist_iterator uiter;
3042
3043     if (!is_fstree(ref_root))
3044         return 0;
3045
3046     if (num_bytes == 0)
3047         return 0;
3048
3049     if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3050         capable(CAP_SYS_RESOURCE))
3051         enforce = false;
3052
3053     spin_lock(&fs_info->qgroup_lock);
3054     if (!fs_info->quota_root)
3055         goto out;
3056
3057     qgroup = find_qgroup_rb(fs_info, ref_root);
3058     if (!qgroup)
3059         goto out;
3060
3061     /*
3062      * in a first step, we check all affected qgroups if any limits would
3063      * be exceeded
3064      */
3065     ulist_reinit(fs_info->qgroup_ulist);
3066     ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3067             qgroup_to_aux(qgroup), GFP_ATOMIC);
3068     if (ret < 0)
3069         goto out;
3070     ULIST_ITER_INIT(&uiter);
3071     while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3072         struct btrfs_qgroup *qg;
3073         struct btrfs_qgroup_list *glist;
3074
3075         qg = unode_aux_to_qgroup(unode);
3076
3077         if (enforce && !qgroup_check_limits(qg, num_bytes)) {
3078             ret = -EDQUOT;
3079             goto out;
3080         }
3081
3082         list_for_each_entry(glist, &qg->groups, next_group) {
3083             ret = ulist_add(fs_info->qgroup_ulist,
3084                     glist->group->qgroupid,
3085                     qgroup_to_aux(glist->group), GFP_ATOMIC);
3086             if (ret < 0)
3087                 goto out;
3088         }
3089     }
3090     ret = 0;
3091     /*
3092      * no limits exceeded, now record the reservation into all qgroups
3093      */
3094     ULIST_ITER_INIT(&uiter);
3095     while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3096         struct btrfs_qgroup *qg;
3097
3098         qg = unode_aux_to_qgroup(unode);
3099
3100         qgroup_rsv_add(fs_info, qg, num_bytes, type);
3101     }
3102
3103 out:
3104     spin_unlock(&fs_info->qgroup_lock);
3105     return ret;
3106 }
3107
3108 /*
3109  * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
3110  * qgroup).
3111  *
3112  * Will handle all higher level qgroup too.
3113  *
3114  * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3115  * This special case is only used for META_PERTRANS type.
3116  */
3117 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3118                    u64 ref_root, u64 num_bytes,
3119                    enum btrfs_qgroup_rsv_type type)
3120 {
3121     struct btrfs_qgroup *qgroup;
3122     struct ulist_node *unode;
3123     struct ulist_iterator uiter;
3124     int ret = 0;
3125
3126     if (!is_fstree(ref_root))
3127         return;
3128
3129     if (num_bytes == 0)
3130         return;
3131
3132     if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3133         WARN(1, "%s: Invalid type to free", __func__);
3134         return;
3135     }
3136     spin_lock(&fs_info->qgroup_lock);
3137
3138     if (!fs_info->quota_root)
3139         goto out;
3140
3141     qgroup = find_qgroup_rb(fs_info, ref_root);
3142     if (!qgroup)
3143         goto out;
3144
3145     if (num_bytes == (u64)-1)
3146         /*
3147          * We're freeing all pertrans rsv, get reserved value from
3148          * level 0 qgroup as real num_bytes to free.
3149          */
3150         num_bytes = qgroup->rsv.values[type];
3151
3152     ulist_reinit(fs_info->qgroup_ulist);
3153     ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3154             qgroup_to_aux(qgroup), GFP_ATOMIC);
3155     if (ret < 0)
3156         goto out;
3157     ULIST_ITER_INIT(&uiter);
3158     while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3159         struct btrfs_qgroup *qg;
3160         struct btrfs_qgroup_list *glist;
3161
3162         qg = unode_aux_to_qgroup(unode);
3163
3164         qgroup_rsv_release(fs_info, qg, num_bytes, type);
3165
3166         list_for_each_entry(glist, &qg->groups, next_group) {
3167             ret = ulist_add(fs_info->qgroup_ulist,
3168                     glist->group->qgroupid,
3169                     qgroup_to_aux(glist->group), GFP_ATOMIC);
3170             if (ret < 0)
3171                 goto out;
3172         }
3173     }
3174
3175 out:
3176     spin_unlock(&fs_info->qgroup_lock);
3177 }
3178
3179 /*
3180  * Check if the leaf is the last leaf. Which means all node pointers
3181  * are at their last position.
3182  */
3183 static bool is_last_leaf(struct btrfs_path *path)
3184 {
3185     int i;
3186
3187     for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3188         if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3189             return false;
3190     }
3191     return true;
3192 }
3193
3194 /*
3195  * returns < 0 on error, 0 when more leafs are to be scanned.
3196  * returns 1 when done.
3197  */
3198 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3199                   struct btrfs_path *path)
3200 {
3201     struct btrfs_fs_info *fs_info = trans->fs_info;
3202     struct btrfs_root *extent_root;
3203     struct btrfs_key found;
3204     struct extent_buffer *scratch_leaf = NULL;
3205     struct ulist *roots = NULL;
3206     u64 num_bytes;
3207     bool done;
3208     int slot;
3209     int ret;
3210
3211     mutex_lock(&fs_info->qgroup_rescan_lock);
3212     extent_root = btrfs_extent_root(fs_info,
3213                 fs_info->qgroup_rescan_progress.objectid);
3214     ret = btrfs_search_slot_for_read(extent_root,
3215                      &fs_info->qgroup_rescan_progress,
3216                      path, 1, 0);
3217
3218     btrfs_debug(fs_info,
3219         "current progress key (%llu %u %llu), search_slot ret %d",
3220         fs_info->qgroup_rescan_progress.objectid,
3221         fs_info->qgroup_rescan_progress.type,
3222         fs_info->qgroup_rescan_progress.offset, ret);
3223
3224     if (ret) {
3225         /*
3226          * The rescan is about to end, we will not be scanning any
3227          * further blocks. We cannot unset the RESCAN flag here, because
3228          * we want to commit the transaction if everything went well.
3229          * To make the live accounting work in this phase, we set our
3230          * scan progress pointer such that every real extent objectid
3231          * will be smaller.
3232          */
3233         fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3234         btrfs_release_path(path);
3235         mutex_unlock(&fs_info->qgroup_rescan_lock);
3236         return ret;
3237     }
3238     done = is_last_leaf(path);
3239
3240     btrfs_item_key_to_cpu(path->nodes[0], &found,
3241                   btrfs_header_nritems(path->nodes[0]) - 1);
3242     fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3243
3244     scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3245     if (!scratch_leaf) {
3246         ret = -ENOMEM;
3247         mutex_unlock(&fs_info->qgroup_rescan_lock);
3248         goto out;
3249     }
3250     slot = path->slots[0];
3251     btrfs_release_path(path);
3252     mutex_unlock(&fs_info->qgroup_rescan_lock);
3253
3254     for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3255         btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3256         if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3257             found.type != BTRFS_METADATA_ITEM_KEY)
3258             continue;
3259         if (found.type == BTRFS_METADATA_ITEM_KEY)
3260             num_bytes = fs_info->nodesize;
3261         else
3262             num_bytes = found.offset;
3263
3264         ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3265                        &roots, false);
3266         if (ret < 0)
3267             goto out;
3268         /* For rescan, just pass old_roots as NULL */
3269         ret = btrfs_qgroup_account_extent(trans, found.objectid,
3270                           num_bytes, NULL, roots);
3271         if (ret < 0)
3272             goto out;
3273     }
3274 out:
3275     if (scratch_leaf)
3276         free_extent_buffer(scratch_leaf);
3277
3278     if (done && !ret) {
3279         ret = 1;
3280         fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3281     }
3282     return ret;
3283 }
3284
3285 static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3286 {
3287     return btrfs_fs_closing(fs_info) ||
3288         test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3289         !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
3290 }
3291
3292 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3293 {
3294     struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3295                              qgroup_rescan_work);
3296     struct btrfs_path *path;
3297     struct btrfs_trans_handle *trans = NULL;
3298     int err = -ENOMEM;
3299     int ret = 0;
3300     bool stopped = false;
3301
3302     path = btrfs_alloc_path();
3303     if (!path)
3304         goto out;
3305     /*
3306      * Rescan should only search for commit root, and any later difference
3307      * should be recorded by qgroup
3308      */
3309     path->search_commit_root = 1;
3310     path->skip_locking = 1;
3311
3312     err = 0;
3313     while (!err && !(stopped = rescan_should_stop(fs_info))) {
3314         trans = btrfs_start_transaction(fs_info->fs_root, 0);
3315         if (IS_ERR(trans)) {
3316             err = PTR_ERR(trans);
3317             break;
3318         }
3319
3320         err = qgroup_rescan_leaf(trans, path);
3321
3322         if (err > 0)
3323             btrfs_commit_transaction(trans);
3324         else
3325             btrfs_end_transaction(trans);
3326     }
3327
3328 out:
3329     btrfs_free_path(path);
3330
3331     mutex_lock(&fs_info->qgroup_rescan_lock);
3332     if (err > 0 &&
3333         fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3334         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3335     } else if (err < 0 || stopped) {
3336         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3337     }
3338     mutex_unlock(&fs_info->qgroup_rescan_lock);
3339
3340     /*
3341      * only update status, since the previous part has already updated the
3342      * qgroup info.
3343      */
3344     trans = btrfs_start_transaction(fs_info->quota_root, 1);
3345     if (IS_ERR(trans)) {
3346         err = PTR_ERR(trans);
3347         trans = NULL;
3348         btrfs_err(fs_info,
3349               "fail to start transaction for status update: %d",
3350               err);
3351     }
3352
3353     mutex_lock(&fs_info->qgroup_rescan_lock);
3354     if (!stopped)
3355         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3356     if (trans) {
3357         ret = update_qgroup_status_item(trans);
3358         if (ret < 0) {
3359             err = ret;
3360             btrfs_err(fs_info, "fail to update qgroup status: %d",
3361                   err);
3362         }
3363     }
3364     fs_info->qgroup_rescan_running = false;
3365     complete_all(&fs_info->qgroup_rescan_completion);
3366     mutex_unlock(&fs_info->qgroup_rescan_lock);
3367
3368     if (!trans)
3369         return;
3370
3371     btrfs_end_transaction(trans);
3372
3373     if (stopped) {
3374         btrfs_info(fs_info, "qgroup scan paused");
3375     } else if (err >= 0) {
3376         btrfs_info(fs_info, "qgroup scan completed%s",
3377             err > 0 ? " (inconsistency flag cleared)" : "");
3378     } else {
3379         btrfs_err(fs_info, "qgroup scan failed with %d", err);
3380     }
3381 }
3382
3383 /*
3384  * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3385  * memory required for the rescan context.
3386  */
3387 static int
3388 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3389            int init_flags)
3390 {
3391     int ret = 0;
3392
3393     if (!init_flags) {
3394         /* we're resuming qgroup rescan at mount time */
3395         if (!(fs_info->qgroup_flags &
3396               BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3397             btrfs_warn(fs_info,
3398             "qgroup rescan init failed, qgroup rescan is not queued");
3399             ret = -EINVAL;
3400         } else if (!(fs_info->qgroup_flags &
3401                  BTRFS_QGROUP_STATUS_FLAG_ON)) {
3402             btrfs_warn(fs_info,
3403             "qgroup rescan init failed, qgroup is not enabled");
3404             ret = -EINVAL;
3405         }
3406
3407         if (ret)
3408             return ret;
3409     }
3410
3411     mutex_lock(&fs_info->qgroup_rescan_lock);
3412
3413     if (init_flags) {
3414         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3415             btrfs_warn(fs_info,
3416                    "qgroup rescan is already in progress");
3417             ret = -EINPROGRESS;
3418         } else if (!(fs_info->qgroup_flags &
3419                  BTRFS_QGROUP_STATUS_FLAG_ON)) {
3420             btrfs_warn(fs_info,
3421             "qgroup rescan init failed, qgroup is not enabled");
3422             ret = -EINVAL;
3423         } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3424             /* Quota disable is in progress */
3425             ret = -EBUSY;
3426         }
3427
3428         if (ret) {
3429             mutex_unlock(&fs_info->qgroup_rescan_lock);
3430             return ret;
3431         }
3432         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3433     }
3434
3435     memset(&fs_info->qgroup_rescan_progress, 0,
3436         sizeof(fs_info->qgroup_rescan_progress));
3437     fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3438     init_completion(&fs_info->qgroup_rescan_completion);
3439     mutex_unlock(&fs_info->qgroup_rescan_lock);
3440
3441     btrfs_init_work(&fs_info->qgroup_rescan_work,
3442             btrfs_qgroup_rescan_worker, NULL, NULL);
3443     return 0;
3444 }
3445
3446 static void
3447 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3448 {
3449     struct rb_node *n;
3450     struct btrfs_qgroup *qgroup;
3451
3452     spin_lock(&fs_info->qgroup_lock);
3453     /* clear all current qgroup tracking information */
3454     for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3455         qgroup = rb_entry(n, struct btrfs_qgroup, node);
3456         qgroup->rfer = 0;
3457         qgroup->rfer_cmpr = 0;
3458         qgroup->excl = 0;
3459         qgroup->excl_cmpr = 0;
3460         qgroup_dirty(fs_info, qgroup);
3461     }
3462     spin_unlock(&fs_info->qgroup_lock);
3463 }
3464
3465 int
3466 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3467 {
3468     int ret = 0;
3469     struct btrfs_trans_handle *trans;
3470
3471     ret = qgroup_rescan_init(fs_info, 0, 1);
3472     if (ret)
3473         return ret;
3474
3475     /*
3476      * We have set the rescan_progress to 0, which means no more
3477      * delayed refs will be accounted by btrfs_qgroup_account_ref.
3478      * However, btrfs_qgroup_account_ref may be right after its call
3479      * to btrfs_find_all_roots, in which case it would still do the
3480      * accounting.
3481      * To solve this, we're committing the transaction, which will
3482      * ensure we run all delayed refs and only after that, we are
3483      * going to clear all tracking information for a clean start.
3484      */
3485
3486     trans = btrfs_join_transaction(fs_info->fs_root);
3487     if (IS_ERR(trans)) {
3488         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3489         return PTR_ERR(trans);
3490     }
3491     ret = btrfs_commit_transaction(trans);
3492     if (ret) {
3493         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3494         return ret;
3495     }
3496
3497     qgroup_rescan_zero_tracking(fs_info);
3498
3499     mutex_lock(&fs_info->qgroup_rescan_lock);
3500     fs_info->qgroup_rescan_running = true;
3501     btrfs_queue_work(fs_info->qgroup_rescan_workers,
3502              &fs_info->qgroup_rescan_work);
3503     mutex_unlock(&fs_info->qgroup_rescan_lock);
3504
3505     return 0;
3506 }
3507
3508 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3509                      bool interruptible)
3510 {
3511     int running;
3512     int ret = 0;
3513
3514     mutex_lock(&fs_info->qgroup_rescan_lock);
3515     running = fs_info->qgroup_rescan_running;
3516     mutex_unlock(&fs_info->qgroup_rescan_lock);
3517
3518     if (!running)
3519         return 0;
3520
3521     if (interruptible)
3522         ret = wait_for_completion_interruptible(
3523                     &fs_info->qgroup_rescan_completion);
3524     else
3525         wait_for_completion(&fs_info->qgroup_rescan_completion);
3526
3527     return ret;
3528 }
3529
3530 /*
3531  * this is only called from open_ctree where we're still single threaded, thus
3532  * locking is omitted here.
3533  */
3534 void
3535 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3536 {
3537     if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3538         mutex_lock(&fs_info->qgroup_rescan_lock);
3539         fs_info->qgroup_rescan_running = true;
3540         btrfs_queue_work(fs_info->qgroup_rescan_workers,
3541                  &fs_info->qgroup_rescan_work);
3542         mutex_unlock(&fs_info->qgroup_rescan_lock);
3543     }
3544 }
3545
/*
 * Iterate over rb-tree nodes in rb_next() order, beginning at @start.
 * @next is fetched before the loop body runs, so the body does not need
 * @node to stay valid in order to advance.
 */
#define rbtree_iterate_from_safe(node, next, start)             \
       for ((node) = (start);                                   \
            (node) && ({ (next) = rb_next(node); 1; });         \
            (node) = (next))
3548
3549 static int qgroup_unreserve_range(struct btrfs_inode *inode,
3550                   struct extent_changeset *reserved, u64 start,
3551                   u64 len)
3552 {
3553     struct rb_node *node;
3554     struct rb_node *next;
3555     struct ulist_node *entry;
3556     int ret = 0;
3557
3558     node = reserved->range_changed.root.rb_node;
3559     if (!node)
3560         return 0;
3561     while (node) {
3562         entry = rb_entry(node, struct ulist_node, rb_node);
3563         if (entry->val < start)
3564             node = node->rb_right;
3565         else
3566             node = node->rb_left;
3567     }
3568
3569     if (entry->val > start && rb_prev(&entry->rb_node))
3570         entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3571                  rb_node);
3572
3573     rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3574         u64 entry_start;
3575         u64 entry_end;
3576         u64 entry_len;
3577         int clear_ret;
3578
3579         entry = rb_entry(node, struct ulist_node, rb_node);
3580         entry_start = entry->val;
3581         entry_end = entry->aux;
3582         entry_len = entry_end - entry_start + 1;
3583
3584         if (entry_start >= start + len)
3585             break;
3586         if (entry_start + entry_len <= start)
3587             continue;
3588         /*
3589          * Now the entry is in [start, start + len), revert the
3590          * EXTENT_QGROUP_RESERVED bit.
3591          */
3592         clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3593                           entry_end, EXTENT_QGROUP_RESERVED);
3594         if (!ret && clear_ret < 0)
3595             ret = clear_ret;
3596
3597         ulist_del(&reserved->range_changed, entry->val, entry->aux);
3598         if (likely(reserved->bytes_changed >= entry_len)) {
3599             reserved->bytes_changed -= entry_len;
3600         } else {
3601             WARN_ON(1);
3602             reserved->bytes_changed = 0;
3603         }
3604     }
3605
3606     return ret;
3607 }
3608
3609 /*
3610  * Try to free some space for qgroup.
3611  *
3612  * For qgroup, there are only 3 ways to free qgroup space:
3613  * - Flush nodatacow write
3614  *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3615  *   In theory, we should only flush nodatacow inodes, but it's not yet
3616  *   possible, so we need to flush the whole root.
3617  *
3618  * - Wait for ordered extents
3619  *   When ordered extents are finished, their reserved metadata is finally
3620  *   converted to per_trans status, which can be freed by later commit
3621  *   transaction.
3622  *
3623  * - Commit transaction
3624  *   This would free the meta_per_trans space.
3625  *   In theory this shouldn't provide much space, but any more qgroup space
3626  *   is needed.
3627  */
3628 static int try_flush_qgroup(struct btrfs_root *root)
3629 {
3630     struct btrfs_trans_handle *trans;
3631     int ret;
3632
3633     /* Can't hold an open transaction or we run the risk of deadlocking. */
3634     ASSERT(current->journal_info == NULL);
3635     if (WARN_ON(current->journal_info))
3636         return 0;
3637
3638     /*
3639      * We don't want to run flush again and again, so if there is a running
3640      * one, we won't try to start a new flush, but exit directly.
3641      */
3642     if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3643         wait_event(root->qgroup_flush_wait,
3644             !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3645         return 0;
3646     }
3647
3648     ret = btrfs_start_delalloc_snapshot(root, true);
3649     if (ret < 0)
3650         goto out;
3651     btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3652
3653     trans = btrfs_join_transaction(root);
3654     if (IS_ERR(trans)) {
3655         ret = PTR_ERR(trans);
3656         goto out;
3657     }
3658
3659     ret = btrfs_commit_transaction(trans);
3660 out:
3661     clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3662     wake_up(&root->qgroup_flush_wait);
3663     return ret;
3664 }
3665
3666 static int qgroup_reserve_data(struct btrfs_inode *inode,
3667             struct extent_changeset **reserved_ret, u64 start,
3668             u64 len)
3669 {
3670     struct btrfs_root *root = inode->root;
3671     struct extent_changeset *reserved;
3672     bool new_reserved = false;
3673     u64 orig_reserved;
3674     u64 to_reserve;
3675     int ret;
3676
3677     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3678         !is_fstree(root->root_key.objectid) || len == 0)
3679         return 0;
3680
3681     /* @reserved parameter is mandatory for qgroup */
3682     if (WARN_ON(!reserved_ret))
3683         return -EINVAL;
3684     if (!*reserved_ret) {
3685         new_reserved = true;
3686         *reserved_ret = extent_changeset_alloc();
3687         if (!*reserved_ret)
3688             return -ENOMEM;
3689     }
3690     reserved = *reserved_ret;
3691     /* Record already reserved space */
3692     orig_reserved = reserved->bytes_changed;
3693     ret = set_record_extent_bits(&inode->io_tree, start,
3694             start + len -1, EXTENT_QGROUP_RESERVED, reserved);
3695
3696     /* Newly reserved space */
3697     to_reserve = reserved->bytes_changed - orig_reserved;
3698     trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3699                     to_reserve, QGROUP_RESERVE);
3700     if (ret < 0)
3701         goto out;
3702     ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3703     if (ret < 0)
3704         goto cleanup;
3705
3706     return ret;
3707
3708 cleanup:
3709     qgroup_unreserve_range(inode, reserved, start, len);
3710 out:
3711     if (new_reserved) {
3712         extent_changeset_free(reserved);
3713         *reserved_ret = NULL;
3714     }
3715     return ret;
3716 }
3717
3718 /*
3719  * Reserve qgroup space for range [start, start + len).
3720  *
3721  * This function will either reserve space from related qgroups or do nothing
3722  * if the range is already reserved.
3723  *
3724  * Return 0 for successful reservation
3725  * Return <0 for error (including -EQUOT)
3726  *
3727  * NOTE: This function may sleep for memory allocation, dirty page flushing and
3728  *   commit transaction. So caller should not hold any dirty page locked.
3729  */
3730 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3731             struct extent_changeset **reserved_ret, u64 start,
3732             u64 len)
3733 {
3734     int ret;
3735
3736     ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3737     if (ret <= 0 && ret != -EDQUOT)
3738         return ret;
3739
3740     ret = try_flush_qgroup(inode->root);
3741     if (ret < 0)
3742         return ret;
3743     return qgroup_reserve_data(inode, reserved_ret, start, len);
3744 }
3745
3746 /* Free ranges specified by @reserved, normally in error path */
3747 static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3748             struct extent_changeset *reserved, u64 start, u64 len)
3749 {
3750     struct btrfs_root *root = inode->root;
3751     struct ulist_node *unode;
3752     struct ulist_iterator uiter;
3753     struct extent_changeset changeset;
3754     int freed = 0;
3755     int ret;
3756
3757     extent_changeset_init(&changeset);
3758     len = round_up(start + len, root->fs_info->sectorsize);
3759     start = round_down(start, root->fs_info->sectorsize);
3760
3761     ULIST_ITER_INIT(&uiter);
3762     while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3763         u64 range_start = unode->val;
3764         /* unode->aux is the inclusive end */
3765         u64 range_len = unode->aux - range_start + 1;
3766         u64 free_start;
3767         u64 free_len;
3768
3769         extent_changeset_release(&changeset);
3770
3771         /* Only free range in range [start, start + len) */
3772         if (range_start >= start + len ||
3773             range_start + range_len <= start)
3774             continue;
3775         free_start = max(range_start, start);
3776         free_len = min(start + len, range_start + range_len) -
3777                free_start;
3778         /*
3779          * TODO: To also modify reserved->ranges_reserved to reflect
3780          * the modification.
3781          *
3782          * However as long as we free qgroup reserved according to
3783          * EXTENT_QGROUP_RESERVED, we won't double free.
3784          * So not need to rush.
3785          */
3786         ret = clear_record_extent_bits(&inode->io_tree, free_start,
3787                 free_start + free_len - 1,
3788                 EXTENT_QGROUP_RESERVED, &changeset);
3789         if (ret < 0)
3790             goto out;
3791         freed += changeset.bytes_changed;
3792     }
3793     btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3794                   BTRFS_QGROUP_RSV_DATA);
3795     ret = freed;
3796 out:
3797     extent_changeset_release(&changeset);
3798     return ret;
3799 }
3800
3801 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3802             struct extent_changeset *reserved, u64 start, u64 len,
3803             int free)
3804 {
3805     struct extent_changeset changeset;
3806     int trace_op = QGROUP_RELEASE;
3807     int ret;
3808
3809     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3810         return 0;
3811
3812     /* In release case, we shouldn't have @reserved */
3813     WARN_ON(!free && reserved);
3814     if (free && reserved)
3815         return qgroup_free_reserved_data(inode, reserved, start, len);
3816     extent_changeset_init(&changeset);
3817     ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
3818                        EXTENT_QGROUP_RESERVED, &changeset);
3819     if (ret < 0)
3820         goto out;
3821
3822     if (free)
3823         trace_op = QGROUP_FREE;
3824     trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3825                     changeset.bytes_changed, trace_op);
3826     if (free)
3827         btrfs_qgroup_free_refroot(inode->root->fs_info,
3828                 inode->root->root_key.objectid,
3829                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3830     ret = changeset.bytes_changed;
3831 out:
3832     extent_changeset_release(&changeset);
3833     return ret;
3834 }
3835
3836 /*
3837  * Free a reserved space range from io_tree and related qgroups
3838  *
3839  * Should be called when a range of pages get invalidated before reaching disk.
3840  * Or for error cleanup case.
3841  * if @reserved is given, only reserved range in [@start, @start + @len) will
3842  * be freed.
3843  *
3844  * For data written to disk, use btrfs_qgroup_release_data().
3845  *
3846  * NOTE: This function may sleep for memory allocation.
3847  */
3848 int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3849             struct extent_changeset *reserved, u64 start, u64 len)
3850 {
3851     return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3852 }
3853
3854 /*
3855  * Release a reserved space range from io_tree only.
3856  *
3857  * Should be called when a range of pages get written to disk and corresponding
3858  * FILE_EXTENT is inserted into corresponding root.
3859  *
3860  * Since new qgroup accounting framework will only update qgroup numbers at
3861  * commit_transaction() time, its reserved space shouldn't be freed from
3862  * related qgroups.
3863  *
3864  * But we should release the range from io_tree, to allow further write to be
3865  * COWed.
3866  *
3867  * NOTE: This function may sleep for memory allocation.
3868  */
3869 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
3870 {
3871     return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3872 }
3873
3874 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3875                   enum btrfs_qgroup_rsv_type type)
3876 {
3877     if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3878         type != BTRFS_QGROUP_RSV_META_PERTRANS)
3879         return;
3880     if (num_bytes == 0)
3881         return;
3882
3883     spin_lock(&root->qgroup_meta_rsv_lock);
3884     if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3885         root->qgroup_meta_rsv_prealloc += num_bytes;
3886     else
3887         root->qgroup_meta_rsv_pertrans += num_bytes;
3888     spin_unlock(&root->qgroup_meta_rsv_lock);
3889 }
3890
3891 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3892                  enum btrfs_qgroup_rsv_type type)
3893 {
3894     if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3895         type != BTRFS_QGROUP_RSV_META_PERTRANS)
3896         return 0;
3897     if (num_bytes == 0)
3898         return 0;
3899
3900     spin_lock(&root->qgroup_meta_rsv_lock);
3901     if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3902         num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3903                   num_bytes);
3904         root->qgroup_meta_rsv_prealloc -= num_bytes;
3905     } else {
3906         num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3907                   num_bytes);
3908         root->qgroup_meta_rsv_pertrans -= num_bytes;
3909     }
3910     spin_unlock(&root->qgroup_meta_rsv_lock);
3911     return num_bytes;
3912 }
3913
3914 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3915                   enum btrfs_qgroup_rsv_type type, bool enforce)
3916 {
3917     struct btrfs_fs_info *fs_info = root->fs_info;
3918     int ret;
3919
3920     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3921         !is_fstree(root->root_key.objectid) || num_bytes == 0)
3922         return 0;
3923
3924     BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3925     trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3926     ret = qgroup_reserve(root, num_bytes, enforce, type);
3927     if (ret < 0)
3928         return ret;
3929     /*
3930      * Record what we have reserved into root.
3931      *
3932      * To avoid quota disabled->enabled underflow.
3933      * In that case, we may try to free space we haven't reserved
3934      * (since quota was disabled), so record what we reserved into root.
3935      * And ensure later release won't underflow this number.
3936      */
3937     add_root_meta_rsv(root, num_bytes, type);
3938     return ret;
3939 }
3940
3941 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3942                 enum btrfs_qgroup_rsv_type type, bool enforce,
3943                 bool noflush)
3944 {
3945     int ret;
3946
3947     ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3948     if ((ret <= 0 && ret != -EDQUOT) || noflush)
3949         return ret;
3950
3951     ret = try_flush_qgroup(root);
3952     if (ret < 0)
3953         return ret;
3954     return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3955 }
3956
3957 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3958 {
3959     struct btrfs_fs_info *fs_info = root->fs_info;
3960
3961     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3962         !is_fstree(root->root_key.objectid))
3963         return;
3964
3965     /* TODO: Update trace point to handle such free */
3966     trace_qgroup_meta_free_all_pertrans(root);
3967     /* Special value -1 means to free all reserved space */
3968     btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3969                   BTRFS_QGROUP_RSV_META_PERTRANS);
3970 }
3971
3972 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3973                   enum btrfs_qgroup_rsv_type type)
3974 {
3975     struct btrfs_fs_info *fs_info = root->fs_info;
3976
3977     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3978         !is_fstree(root->root_key.objectid))
3979         return;
3980
3981     /*
3982      * reservation for META_PREALLOC can happen before quota is enabled,
3983      * which can lead to underflow.
3984      * Here ensure we will only free what we really have reserved.
3985      */
3986     num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3987     BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3988     trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3989     btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3990                   num_bytes, type);
3991 }
3992
3993 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3994                 int num_bytes)
3995 {
3996     struct btrfs_qgroup *qgroup;
3997     struct ulist_node *unode;
3998     struct ulist_iterator uiter;
3999     int ret = 0;
4000
4001     if (num_bytes == 0)
4002         return;
4003     if (!fs_info->quota_root)
4004         return;
4005
4006     spin_lock(&fs_info->qgroup_lock);
4007     qgroup = find_qgroup_rb(fs_info, ref_root);
4008     if (!qgroup)
4009         goto out;
4010     ulist_reinit(fs_info->qgroup_ulist);
4011     ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
4012                qgroup_to_aux(qgroup), GFP_ATOMIC);
4013     if (ret < 0)
4014         goto out;
4015     ULIST_ITER_INIT(&uiter);
4016     while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
4017         struct btrfs_qgroup *qg;
4018         struct btrfs_qgroup_list *glist;
4019
4020         qg = unode_aux_to_qgroup(unode);
4021
4022         qgroup_rsv_release(fs_info, qg, num_bytes,
4023                 BTRFS_QGROUP_RSV_META_PREALLOC);
4024         qgroup_rsv_add(fs_info, qg, num_bytes,
4025                 BTRFS_QGROUP_RSV_META_PERTRANS);
4026         list_for_each_entry(glist, &qg->groups, next_group) {
4027             ret = ulist_add(fs_info->qgroup_ulist,
4028                     glist->group->qgroupid,
4029                     qgroup_to_aux(glist->group), GFP_ATOMIC);
4030             if (ret < 0)
4031                 goto out;
4032         }
4033     }
4034 out:
4035     spin_unlock(&fs_info->qgroup_lock);
4036 }
4037
4038 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
4039 {
4040     struct btrfs_fs_info *fs_info = root->fs_info;
4041
4042     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4043         !is_fstree(root->root_key.objectid))
4044         return;
4045     /* Same as btrfs_qgroup_free_meta_prealloc() */
4046     num_bytes = sub_root_meta_rsv(root, num_bytes,
4047                       BTRFS_QGROUP_RSV_META_PREALLOC);
4048     trace_qgroup_meta_convert(root, num_bytes);
4049     qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
4050 }
4051
4052 /*
4053  * Check qgroup reserved space leaking, normally at destroy inode
4054  * time
4055  */
4056 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4057 {
4058     struct extent_changeset changeset;
4059     struct ulist_node *unode;
4060     struct ulist_iterator iter;
4061     int ret;
4062
4063     extent_changeset_init(&changeset);
4064     ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4065             EXTENT_QGROUP_RESERVED, &changeset);
4066
4067     WARN_ON(ret < 0);
4068     if (WARN_ON(changeset.bytes_changed)) {
4069         ULIST_ITER_INIT(&iter);
4070         while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4071             btrfs_warn(inode->root->fs_info,
4072         "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4073                 btrfs_ino(inode), unode->val, unode->aux);
4074         }
4075         btrfs_qgroup_free_refroot(inode->root->fs_info,
4076                 inode->root->root_key.objectid,
4077                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4078
4079     }
4080     extent_changeset_release(&changeset);
4081 }
4082
4083 void btrfs_qgroup_init_swapped_blocks(
4084     struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4085 {
4086     int i;
4087
4088     spin_lock_init(&swapped_blocks->lock);
4089     for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4090         swapped_blocks->blocks[i] = RB_ROOT;
4091     swapped_blocks->swapped = false;
4092 }
4093
4094 /*
4095  * Delete all swapped blocks record of @root.
4096  * Every record here means we skipped a full subtree scan for qgroup.
4097  *
4098  * Gets called when committing one transaction.
4099  */
4100 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4101 {
4102     struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4103     int i;
4104
4105     swapped_blocks = &root->swapped_blocks;
4106
4107     spin_lock(&swapped_blocks->lock);
4108     if (!swapped_blocks->swapped)
4109         goto out;
4110     for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4111         struct rb_root *cur_root = &swapped_blocks->blocks[i];
4112         struct btrfs_qgroup_swapped_block *entry;
4113         struct btrfs_qgroup_swapped_block *next;
4114
4115         rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4116                              node)
4117             kfree(entry);
4118         swapped_blocks->blocks[i] = RB_ROOT;
4119     }
4120     swapped_blocks->swapped = false;
4121 out:
4122     spin_unlock(&swapped_blocks->lock);
4123 }
4124
4125 /*
4126  * Add subtree roots record into @subvol_root.
4127  *
4128  * @subvol_root:    tree root of the subvolume tree get swapped
4129  * @bg:         block group under balance
4130  * @subvol_parent/slot: pointer to the subtree root in subvolume tree
4131  * @reloc_parent/slot:  pointer to the subtree root in reloc tree
4132  *          BOTH POINTERS ARE BEFORE TREE SWAP
4133  * @last_snapshot:  last snapshot generation of the subvolume tree
4134  */
4135 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4136         struct btrfs_root *subvol_root,
4137         struct btrfs_block_group *bg,
4138         struct extent_buffer *subvol_parent, int subvol_slot,
4139         struct extent_buffer *reloc_parent, int reloc_slot,
4140         u64 last_snapshot)
4141 {
4142     struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4143     struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4144     struct btrfs_qgroup_swapped_block *block;
4145     struct rb_node **cur;
4146     struct rb_node *parent = NULL;
4147     int level = btrfs_header_level(subvol_parent) - 1;
4148     int ret = 0;
4149
4150     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4151         return 0;
4152
4153     if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4154         btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4155         btrfs_err_rl(fs_info,
4156         "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4157             __func__,
4158             btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4159             btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4160         return -EUCLEAN;
4161     }
4162
4163     block = kmalloc(sizeof(*block), GFP_NOFS);
4164     if (!block) {
4165         ret = -ENOMEM;
4166         goto out;
4167     }
4168
4169     /*
4170      * @reloc_parent/slot is still before swap, while @block is going to
4171      * record the bytenr after swap, so we do the swap here.
4172      */
4173     block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4174     block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4175                                  reloc_slot);
4176     block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4177     block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4178                                 subvol_slot);
4179     block->last_snapshot = last_snapshot;
4180     block->level = level;
4181
4182     /*
4183      * If we have bg == NULL, we're called from btrfs_recover_relocation(),
4184      * no one else can modify tree blocks thus we qgroup will not change
4185      * no matter the value of trace_leaf.
4186      */
4187     if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4188         block->trace_leaf = true;
4189     else
4190         block->trace_leaf = false;
4191     btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4192
4193     /* Insert @block into @blocks */
4194     spin_lock(&blocks->lock);
4195     cur = &blocks->blocks[level].rb_node;
4196     while (*cur) {
4197         struct btrfs_qgroup_swapped_block *entry;
4198
4199         parent = *cur;
4200         entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4201                  node);
4202
4203         if (entry->subvol_bytenr < block->subvol_bytenr) {
4204             cur = &(*cur)->rb_left;
4205         } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4206             cur = &(*cur)->rb_right;
4207         } else {
4208             if (entry->subvol_generation !=
4209                     block->subvol_generation ||
4210                 entry->reloc_bytenr != block->reloc_bytenr ||
4211                 entry->reloc_generation !=
4212                     block->reloc_generation) {
4213                 /*
4214                  * Duplicated but mismatch entry found.
4215                  * Shouldn't happen.
4216                  *
4217                  * Marking qgroup inconsistent should be enough
4218                  * for end users.
4219                  */
4220                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4221                 ret = -EEXIST;
4222             }
4223             kfree(block);
4224             goto out_unlock;
4225         }
4226     }
4227     rb_link_node(&block->node, parent, cur);
4228     rb_insert_color(&block->node, &blocks->blocks[level]);
4229     blocks->swapped = true;
4230 out_unlock:
4231     spin_unlock(&blocks->lock);
4232 out:
4233     if (ret < 0)
4234         fs_info->qgroup_flags |=
4235             BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4236     return ret;
4237 }
4238
4239 /*
4240  * Check if the tree block is a subtree root, and if so do the needed
4241  * delayed subtree trace for qgroup.
4242  *
4243  * This is called during btrfs_cow_block().
4244  */
4245 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4246                      struct btrfs_root *root,
4247                      struct extent_buffer *subvol_eb)
4248 {
4249     struct btrfs_fs_info *fs_info = root->fs_info;
4250     struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4251     struct btrfs_qgroup_swapped_block *block;
4252     struct extent_buffer *reloc_eb = NULL;
4253     struct rb_node *node;
4254     bool found = false;
4255     bool swapped = false;
4256     int level = btrfs_header_level(subvol_eb);
4257     int ret = 0;
4258     int i;
4259
4260     if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4261         return 0;
4262     if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4263         return 0;
4264
4265     spin_lock(&blocks->lock);
4266     if (!blocks->swapped) {
4267         spin_unlock(&blocks->lock);
4268         return 0;
4269     }
4270     node = blocks->blocks[level].rb_node;
4271
4272     while (node) {
4273         block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4274         if (block->subvol_bytenr < subvol_eb->start) {
4275             node = node->rb_left;
4276         } else if (block->subvol_bytenr > subvol_eb->start) {
4277             node = node->rb_right;
4278         } else {
4279             found = true;
4280             break;
4281         }
4282     }
4283     if (!found) {
4284         spin_unlock(&blocks->lock);
4285         goto out;
4286     }
4287     /* Found one, remove it from @blocks first and update blocks->swapped */
4288     rb_erase(&block->node, &blocks->blocks[level]);
4289     for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4290         if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4291             swapped = true;
4292             break;
4293         }
4294     }
4295     blocks->swapped = swapped;
4296     spin_unlock(&blocks->lock);
4297
4298     /* Read out reloc subtree root */
4299     reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
4300                    block->reloc_generation, block->level,
4301                    &block->first_key);
4302     if (IS_ERR(reloc_eb)) {
4303         ret = PTR_ERR(reloc_eb);
4304         reloc_eb = NULL;
4305         goto free_out;
4306     }
4307     if (!extent_buffer_uptodate(reloc_eb)) {
4308         ret = -EIO;
4309         goto free_out;
4310     }
4311
4312     ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4313             block->last_snapshot, block->trace_leaf);
4314 free_out:
4315     kfree(block);
4316     free_extent_buffer(reloc_eb);
4317 out:
4318     if (ret < 0) {
4319         btrfs_err_rl(fs_info,
4320                  "failed to account subtree at bytenr %llu: %d",
4321                  subvol_eb->start, ret);
4322         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4323     }
4324     return ret;
4325 }
4326
4327 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4328 {
4329     struct btrfs_qgroup_extent_record *entry;
4330     struct btrfs_qgroup_extent_record *next;
4331     struct rb_root *root;
4332
4333     root = &trans->delayed_refs.dirty_extent_root;
4334     rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4335         ulist_free(entry->old_roots);
4336         kfree(entry);
4337     }
4338 }