// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

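/*
 * Illustrative note (not part of the original source): the mask above groups
 * every profile that stripes data across more than one device, so a single
 * test covers RAID0, RAID10, RAID5 and RAID6, e.g.:
 *
 *	if (bg_flags & BTRFS_BLOCK_GROUP_STRIPE_MASK)
 *		// block group stripes its data across devices
 */
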
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);

	if (!profile)
		return BTRFS_RAID_SINGLE;

	return BTRFS_BG_FLAG_TO_INDEX(profile);
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

int btrfs_nr_parity_stripes(u64 type)
{
	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);

	return btrfs_raid_array[index].nparity;
}
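
/*
 * Worked example (illustrative, not from the original source): for a RAID6
 * chunk btrfs_nr_parity_stripes(BTRFS_BLOCK_GROUP_RAID6) returns 2 and RAID5
 * returns 1, matching the .nparity fields in btrfs_raid_array above; all
 * mirror-only and striped-only profiles return 0.
 */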

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * characters had been written.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}
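
/*
 * Usage sketch (illustrative, not from the original source): for a RAID1
 * metadata block group the function writes "metadata|raid1" into @buf:
 *
 *	char buf[64];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_METADATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 */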

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with the chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel:
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.  A device
 * operation in Paused or Running state can be canceled or resumed either by
 * ioctl (Balance only) or when remounted as read-write.  The exclusive status
 * is cleared when the device operation is canceled or completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_uuid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		sync_blockdev(*bdev);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}
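
/*
 * Note on pairing (descriptive, added for clarity): a successful
 * btrfs_get_bdev_and_sb() hands back both a block device and a super block
 * reference; callers release them with btrfs_release_disk_super() and
 * blkdev_put(), as btrfs_open_one_device() below does on its error path.
 */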

/*
 * Search and remove all stale devices (which are not mounted).  When both
 * inputs are NULL, it will search and release all stale devices.
 *
 * @devt:	 Optional. When provided will it release all unmounted devices
 *		 matching this devt only.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		 devices.
 *
 * Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (devt)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (devt && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}
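
/*
 * Note (added for clarity): open_fs_devices() below drops a device from the
 * list only when this open fails with -ENODATA, i.e. apparently no valid
 * btrfs super block was found there; any other failure keeps the
 * btrfs_device around so a later scan or mount attempt can retry it.
 */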

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the
 * fs_devices being created with a disk that has already completed its fsid
 * change. Such disk can belong to an fs which has its FSID changed or to one
 * which doesn't. Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases :
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in an
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	error = lookup_bdev(path, &path_devt);
	if (error)
		return ERR_PTR(error);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace
		 * the metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);
		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unwanted.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted.
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of
		 * BTRFS_DEV_REPLACE_DEVID in btrfs_init_dev_replace() so just
		 * continue here.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be
	 * further committed. However that error might be transient and if we
	 * unmount the filesystem and mount it again, we should allow the mount
	 * to succeed (btrfs_check_rw_degradable() should not fail) - if after
	 * mounting the filesystem again we still get flush errors, then we
	 * will again abort any transaction and set the error state,
	 * guaranteeing no commits of unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}
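
/*
 * devid_cmp() is the list_sort() comparator used by btrfs_open_devices()
 * below so that the devices of a filesystem are opened in ascending devid
 * order.
 */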

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex.
	 */
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}
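
/*
 * Illustrative values (not from the original source): on a regular device a
 * search starting at offset 0 is bumped to BTRFS_DEVICE_RANGE_RESERVED, so
 * dev extents never overlap the reserved area at the start of the device
 * that holds the primary super block.
 */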

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/*
 * Check if specified hole is suitable for allocation.
 *
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end
		 * up sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to make sure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
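
/*
 * Worked example (illustrative, not from the original source): with dev
 * extents covering [1M, 5M) and [9M, 13M) on a 16M device, asking for
 * num_bytes=4M finds the hole at 5M and returns *start=5M, *len=4M; asking
 * for 8M fails with -ENOSPC but still reports the largest hole through
 * *start and *len.
 */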

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}
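
/*
 * Note (added for clarity): the next chunk's logical start is placed right
 * after the mapping with the highest logical address currently in the extent
 * map tree, so chunk logical addresses grow monotonically while mappings
 * exist.
 */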

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, true);
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	btrfs_trans_release_chunk_metadata(trans);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 *
 * We don't care about errors here, this is just to be kind to userspace.
 */
static void update_dev_time(const char *device_path)
{
	struct path path;
	struct timespec64 now;
	int ret;

	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
	if (ret)
		return;

	now = current_time(d_inode(path.dentry));
	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
	path_put(&path);
}

static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	btrfs_trans_release_chunk_metadata(trans);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Verify that @num_devices satisfies the minimum device count of every RAID
 * profile that is currently in use on the filesystem, as recorded in
 * btrfs_raid_array[].devs_min.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}
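
/*
 * Example (illustrative, not from the original source): on a two-device
 * filesystem with RAID1 block groups, removing one device would leave
 * num_devices = 1, below the devs_min = 2 of
 * btrfs_raid_array[BTRFS_RAID_RAID1], so the removal is rejected with
 * BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET.
 */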

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_dev
 * and replace it with the provided or the next active device, in the context
 * where this function called, there should be always be another device (or
 * this_dev) which is active.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
		fs_info->fs_devices->latest_dev = next_device;
}

/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
	up_read(&fs_info->dev_replace.rwsem);

	return num_devices;
}

void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);

		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				"error clearing superblock number %d (%d)",
				copy_num, ret);
		btrfs_release_disk_super(disk_super);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime on the device path */
	update_dev_time(device_path);
}
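
/*
 * Note (added for clarity): on zoned devices the super block mirrors live in
 * log zones that cannot simply be overwritten, which is why the loop above
 * resets the corresponding zones instead of zeroing the magic in place.
 */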

int btrfs_rm_device(struct btrfs_fs_info *fs_info,
		    struct btrfs_dev_lookup_args *args,
		    struct block_device **bdev, fmode_t *mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
		return -EINVAL;
	}

	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
	 */
	num_devices = btrfs_num_devices(fs_info);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		return ret;

	device = btrfs_find_device(fs_info->fs_devices, args);
	if (!device) {
		if (args->missing)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = -ENOENT;
		return ret;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		return -ETXTBSY;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return BTRFS_ERROR_DEV_TGT_REPLACE;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1)
		return BTRFS_ERROR_DEV_ONLY_WRITABLE;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_undo;
	}

	ret = btrfs_rm_dev_item(trans, device);
	if (ret) {
		/* Any error in dev item removal is critical */
		btrfs_crit(fs_info,
			   "failed to remove device item for devid %llu: %d",
			   device->devid, ret);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all of the super blocks on
	 * the system: the device has to be off the list before the supers go
	 * out so that it is not written back.
	 *
	 * In normal cases cur_devices == fs_devices. But in case of deleting
	 * a seed device, cur_devices points to the seed's own fs_devices
	 * listed under fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;

	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and out of the metadata.
	 * Wipe the on-disk super block copies so the device is not scanned
	 * and mounted again, but only if it was writeable: a read-only seed
	 * device must stay intact because it may still back other sprouted
	 * filesystems.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*mode = device->mode;
	synchronize_rcu();
	btrfs_free_device(device);

	/*
	 * This can happen if cur_devices is the private seed devices list. We
	 * cannot call close_fs_devices() here because it expects the
	 * uuid_mutex to be held.
	 */
	if (cur_devices->num_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
		free_fs_devices(cur_devices);
	}

	ret = btrfs_commit_transaction(trans);

	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprouted FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}

/*
 * Populate args from device at path.
 *
 * @fs_info:	the filesystem
 * @args:	the args to populate
 * @path:	the path to the device
 *
 * This will read the super block of the device at @path and populate @args
 * with the devid, fsid, and uuid.  This is meant to be used for ioctls that
 * need to lookup a device to operate on, but can do it without holding our
 * ->device_list_mutex.
 *
 * The caller must call btrfs_put_dev_args_from_path() in order to free up the
 * allocated fsid and uuid.
 */
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
				 struct btrfs_dev_lookup_args *args,
				 const char *path)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	int ret;

	if (!path || !path[0])
		return -EINVAL;
	if (!strcmp(path, "missing")) {
		args->missing = true;
		return 0;
	}

	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
	if (!args->uuid || !args->fsid) {
		btrfs_put_dev_args_from_path(args);
		return -ENOMEM;
	}

	ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
				    &bdev, &disk_super);
	if (ret) {
		btrfs_put_dev_args_from_path(args);
		return ret;
	}

	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
	else
		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, FMODE_READ);
	return 0;
}

/*
 * Only use this jointly with btrfs_get_dev_args_from_path() because we will
 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
 * with uuid and fsid field pointing to user data.
 */
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
{
	kfree(args->uuid);
	kfree(args->fsid);
	args->uuid = NULL;
	args->fsid = NULL;
}

struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	if (devid) {
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
	if (ret)
		return ERR_PTR(ret);
	device = btrfs_find_device(fs_info->fs_devices, &args);
	btrfs_put_dev_args_from_path(&args);
	if (!device)
		return ERR_PTR(-ENOENT);
	return device;
}

static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return ERR_PTR(-EINVAL);

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return seed_devices;

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return old_devices;
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	return seed_devices;
}
2443
2444 /*
2445  * Splice seed devices into the sprout fs_devices.
2446  * Generate a new fsid for the sprouted read-write filesystem.
2447  */
2448 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
2449 struct btrfs_fs_devices *seed_devices)
2450 {
2451 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2452 struct btrfs_super_block *disk_super = fs_info->super_copy;
2453 struct btrfs_device *device;
2454 u64 super_flags;
2455
2456 /*
2457  * We are updating the fsid, the thread leading to device_list_add()
2458  * could race, so uuid_mutex is needed.
2459  */
2460 lockdep_assert_held(&uuid_mutex);
2461
2462 /*
2463  * The threads listed below may traverse dev_list but can do that
2464  * without device_list_mutex:
2465  * - All device ops and balance - as we are in btrfs_exclop_start.
2466  * - Various dev_list readers - are using RCU.
2467  * - btrfs_ioctl_fitrim() - is not run with exclop because it is RO.
2468  *
2469  * Even device_list_mutex does not help with the above case.
2470  *
2471  * So the dev_list is spliced with an RCU-safe primitive below, while
2472  * device_list_mutex keeps out every other writer.
2473  */
2474 lockdep_assert_held(&fs_devices->device_list_mutex);
2475
2476 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2477 synchronize_rcu);
2478 list_for_each_entry(device, &seed_devices->devices, dev_list)
2479 device->fs_devices = seed_devices;
2480
2481 fs_devices->seeding = false;
2482 fs_devices->num_devices = 0;
2483 fs_devices->open_devices = 0;
2484 fs_devices->missing_devices = 0;
2485 fs_devices->rotating = false;
2486 list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2487
2488 generate_random_uuid(fs_devices->fsid);
2489 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2490 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2491
2492 super_flags = btrfs_super_flags(disk_super) &
2493 ~BTRFS_SUPER_FLAG_SEEDING;
2494 btrfs_set_super_flags(disk_super, super_flags);
2495 }
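
/*
 * Seeding overview, as implemented by the helpers here:
 * btrfs_init_sprout() keeps a clone of the original seed fs_devices on
 * fs_uuids and returns an empty fs_devices that will hold the seed
 * members; btrfs_setup_sprout() splices the current device list onto
 * that seed list, resets the counters of the now writable fs_devices
 * and generates a fresh fsid for the sprout; btrfs_finish_sprout()
 * (below) then stores the expected generation in every seed device item
 * in the chunk tree.
 */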
2496
2497 /*
2498  * Store the expected generation for seed devices in device items.
2499  */
2500 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2501 {
2502 BTRFS_DEV_LOOKUP_ARGS(args);
2503 struct btrfs_fs_info *fs_info = trans->fs_info;
2504 struct btrfs_root *root = fs_info->chunk_root;
2505 struct btrfs_path *path;
2506 struct extent_buffer *leaf;
2507 struct btrfs_dev_item *dev_item;
2508 struct btrfs_device *device;
2509 struct btrfs_key key;
2510 u8 fs_uuid[BTRFS_FSID_SIZE];
2511 u8 dev_uuid[BTRFS_UUID_SIZE];
2512 int ret;
2513
2514 path = btrfs_alloc_path();
2515 if (!path)
2516 return -ENOMEM;
2517
2518 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2519 key.offset = 0;
2520 key.type = BTRFS_DEV_ITEM_KEY;
2521
2522 while (1) {
2523 btrfs_reserve_chunk_metadata(trans, false);
2524 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2525 btrfs_trans_release_chunk_metadata(trans);
2526 if (ret < 0)
2527 goto error;
2528
2529 leaf = path->nodes[0];
2530 next_slot:
2531 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2532 ret = btrfs_next_leaf(root, path);
2533 if (ret > 0)
2534 break;
2535 if (ret < 0)
2536 goto error;
2537 leaf = path->nodes[0];
2538 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2539 btrfs_release_path(path);
2540 continue;
2541 }
2542
2543 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2544 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2545 key.type != BTRFS_DEV_ITEM_KEY)
2546 break;
2547
2548 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2549 struct btrfs_dev_item);
2550 args.devid = btrfs_device_id(leaf, dev_item);
2551 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2552 BTRFS_UUID_SIZE);
2553 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2554 BTRFS_FSID_SIZE);
2555 args.uuid = dev_uuid;
2556 args.fsid = fs_uuid;
2557 device = btrfs_find_device(fs_info->fs_devices, &args);
2558 BUG_ON(!device);
2559
2560 if (device->fs_devices->seeding) {
2561 btrfs_set_device_generation(leaf, dev_item,
2562 device->generation);
2563 btrfs_mark_buffer_dirty(leaf);
2564 }
2565
2566 path->slots[0]++;
2567 goto next_slot;
2568 }
2569 ret = 0;
2570 error:
2571 btrfs_free_path(path);
2572 return ret;
2573 }
2574
2575 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2576 {
2577 struct btrfs_root *root = fs_info->dev_root;
2578 struct btrfs_trans_handle *trans;
2579 struct btrfs_device *device;
2580 struct block_device *bdev;
2581 struct super_block *sb = fs_info->sb;
2582 struct rcu_string *name;
2583 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2584 struct btrfs_fs_devices *seed_devices;
2585 u64 orig_super_total_bytes;
2586 u64 orig_super_num_devices;
2587 int ret = 0;
2588 bool seeding_dev = false;
2589 bool locked = false;
2590
2591 if (sb_rdonly(sb) && !fs_devices->seeding)
2592 return -EROFS;
2593
2594 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2595 fs_info->bdev_holder);
2596 if (IS_ERR(bdev))
2597 return PTR_ERR(bdev);
2598
2599 if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2600 ret = -EINVAL;
2601 goto error;
2602 }
2603
2604 if (fs_devices->seeding) {
2605 seeding_dev = true;
2606 down_write(&sb->s_umount);
2607 mutex_lock(&uuid_mutex);
2608 locked = true;
2609 }
2610
2611 sync_blockdev(bdev);
2612
2613 rcu_read_lock();
2614 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2615 if (device->bdev == bdev) {
2616 ret = -EEXIST;
2617 rcu_read_unlock();
2618 goto error;
2619 }
2620 }
2621 rcu_read_unlock();
2622
2623 device = btrfs_alloc_device(fs_info, NULL, NULL);
2624 if (IS_ERR(device)) {
2625 /* we can safely leave the fs_devices entry around */
2626 ret = PTR_ERR(device);
2627 goto error;
2628 }
2629
2630 name = rcu_string_strdup(device_path, GFP_KERNEL);
2631 if (!name) {
2632 ret = -ENOMEM;
2633 goto error_free_device;
2634 }
2635 rcu_assign_pointer(device->name, name);
2636
2637 device->fs_info = fs_info;
2638 device->bdev = bdev;
2639 ret = lookup_bdev(device_path, &device->devt);
2640 if (ret)
2641 goto error_free_device;
2642
2643 ret = btrfs_get_dev_zone_info(device, false);
2644 if (ret)
2645 goto error_free_device;
2646
2647 trans = btrfs_start_transaction(root, 0);
2648 if (IS_ERR(trans)) {
2649 ret = PTR_ERR(trans);
2650 goto error_free_zone;
2651 }
2652
2653 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2654 device->generation = trans->transid;
2655 device->io_width = fs_info->sectorsize;
2656 device->io_align = fs_info->sectorsize;
2657 device->sector_size = fs_info->sectorsize;
2658 device->total_bytes =
2659 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
2660 device->disk_total_bytes = device->total_bytes;
2661 device->commit_total_bytes = device->total_bytes;
2662 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2663 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2664 device->mode = FMODE_EXCL;
2665 device->dev_stats_valid = 1;
2666 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2667
2668 if (seeding_dev) {
2669 btrfs_clear_sb_rdonly(sb);
2670
2671 /* GFP_KERNEL allocation must not be under device_list_mutex */
2672 seed_devices = btrfs_init_sprout(fs_info);
2673 if (IS_ERR(seed_devices)) {
2674 ret = PTR_ERR(seed_devices);
2675 btrfs_abort_transaction(trans, ret);
2676 goto error_trans;
2677 }
2678 }
2679
2680 mutex_lock(&fs_devices->device_list_mutex);
2681 if (seeding_dev) {
2682 btrfs_setup_sprout(fs_info, seed_devices);
2683 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
2684 device);
2685 }
2686
2687 device->fs_devices = fs_devices;
2688
2689 mutex_lock(&fs_info->chunk_mutex);
2690 list_add_rcu(&device->dev_list, &fs_devices->devices);
2691 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2692 fs_devices->num_devices++;
2693 fs_devices->open_devices++;
2694 fs_devices->rw_devices++;
2695 fs_devices->total_devices++;
2696 fs_devices->total_rw_bytes += device->total_bytes;
2697
2698 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2699
2700 if (!bdev_nonrot(bdev))
2701 fs_devices->rotating = true;
2702
2703 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2704 btrfs_set_super_total_bytes(fs_info->super_copy,
2705 round_down(orig_super_total_bytes + device->total_bytes,
2706 fs_info->sectorsize));
2707
2708 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2709 btrfs_set_super_num_devices(fs_info->super_copy,
2710 orig_super_num_devices + 1);
2711
2712 /*
2713  * We've got more storage, clear any full flags on the space
2714  * infos.
2715  */
2716 btrfs_clear_space_info_full(fs_info);
2717
2718 mutex_unlock(&fs_info->chunk_mutex);
2719
2720 /* Add sysfs device entry */
2721 btrfs_sysfs_add_device(device);
2722
2723 mutex_unlock(&fs_devices->device_list_mutex);
2724
2725 if (seeding_dev) {
2726 mutex_lock(&fs_info->chunk_mutex);
2727 ret = init_first_rw_device(trans);
2728 mutex_unlock(&fs_info->chunk_mutex);
2729 if (ret) {
2730 btrfs_abort_transaction(trans, ret);
2731 goto error_sysfs;
2732 }
2733 }
2734
2735 ret = btrfs_add_dev_item(trans, device);
2736 if (ret) {
2737 btrfs_abort_transaction(trans, ret);
2738 goto error_sysfs;
2739 }
2740
2741 if (seeding_dev) {
2742 ret = btrfs_finish_sprout(trans);
2743 if (ret) {
2744 btrfs_abort_transaction(trans, ret);
2745 goto error_sysfs;
2746 }
2747
2748 /*
2749  * fs_devices now represents the newly sprouted filesystem and
2750  * its fsid has been changed by btrfs_setup_sprout().
2751  */
2752 btrfs_sysfs_update_sprout_fsid(fs_devices);
2753 }
2754
2755 ret = btrfs_commit_transaction(trans);
2756
2757 if (seeding_dev) {
2758 mutex_unlock(&uuid_mutex);
2759 up_write(&sb->s_umount);
2760 locked = false;
2761
2762 if (ret)
2763 return ret;
2764
2765 ret = btrfs_relocate_sys_chunks(fs_info);
2766 if (ret < 0)
2767 btrfs_handle_fs_error(fs_info, ret,
2768 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2769 trans = btrfs_attach_transaction(root);
2770 if (IS_ERR(trans)) {
2771 if (PTR_ERR(trans) == -ENOENT)
2772 return 0;
2773 ret = PTR_ERR(trans);
2774 trans = NULL;
2775 goto error_sysfs;
2776 }
2777 ret = btrfs_commit_transaction(trans);
2778 }
2779
2780 /*
2781  * Now that we have written a new super block to this device, check all
2782  * other fs_devices lists to see if device_path alienates any other
2783  * scanned device. We can ignore the return value as it typically
2784  * returns -EINVAL and only succeeds if the device was an alien.
2785  */
2786
2787 btrfs_forget_devices(device->devt);
2788
2789 /* Update ctime/mtime for blkid or udev */
2790 update_dev_time(device_path);
2791
2792 return ret;
2793
2794 error_sysfs:
2795 btrfs_sysfs_remove_device(device);
2796 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2797 mutex_lock(&fs_info->chunk_mutex);
2798 list_del_rcu(&device->dev_list);
2799 list_del(&device->dev_alloc_list);
2800 fs_info->fs_devices->num_devices--;
2801 fs_info->fs_devices->open_devices--;
2802 fs_info->fs_devices->rw_devices--;
2803 fs_info->fs_devices->total_devices--;
2804 fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2805 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2806 btrfs_set_super_total_bytes(fs_info->super_copy,
2807 orig_super_total_bytes);
2808 btrfs_set_super_num_devices(fs_info->super_copy,
2809 orig_super_num_devices);
2810 mutex_unlock(&fs_info->chunk_mutex);
2811 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2812 error_trans:
2813 if (seeding_dev)
2814 btrfs_set_sb_rdonly(sb);
2815 if (trans)
2816 btrfs_end_transaction(trans);
2817 error_free_zone:
2818 btrfs_destroy_dev_zone_info(device);
2819 error_free_device:
2820 btrfs_free_device(device);
2821 error:
2822 blkdev_put(bdev, FMODE_EXCL);
2823 if (locked) {
2824 mutex_unlock(&uuid_mutex);
2825 up_write(&sb->s_umount);
2826 }
2827 return ret;
2828 }
2829
2830 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2831 struct btrfs_device *device)
2832 {
2833 int ret;
2834 struct btrfs_path *path;
2835 struct btrfs_root *root = device->fs_info->chunk_root;
2836 struct btrfs_dev_item *dev_item;
2837 struct extent_buffer *leaf;
2838 struct btrfs_key key;
2839
2840 path = btrfs_alloc_path();
2841 if (!path)
2842 return -ENOMEM;
2843
2844 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2845 key.type = BTRFS_DEV_ITEM_KEY;
2846 key.offset = device->devid;
2847
2848 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2849 if (ret < 0)
2850 goto out;
2851
2852 if (ret > 0) {
2853 ret = -ENOENT;
2854 goto out;
2855 }
2856
2857 leaf = path->nodes[0];
2858 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2859
2860 btrfs_set_device_id(leaf, dev_item, device->devid);
2861 btrfs_set_device_type(leaf, dev_item, device->type);
2862 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2863 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2864 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2865 btrfs_set_device_total_bytes(leaf, dev_item,
2866 btrfs_device_get_disk_total_bytes(device));
2867 btrfs_set_device_bytes_used(leaf, dev_item,
2868 btrfs_device_get_bytes_used(device));
2869 btrfs_mark_buffer_dirty(leaf);
2870
2871 out:
2872 btrfs_free_path(path);
2873 return ret;
2874 }
2875
2876 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2877 struct btrfs_device *device, u64 new_size)
2878 {
2879 struct btrfs_fs_info *fs_info = device->fs_info;
2880 struct btrfs_super_block *super_copy = fs_info->super_copy;
2881 u64 old_total;
2882 u64 diff;
2883 int ret;
2884
2885 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2886 return -EACCES;
2887
2888 new_size = round_down(new_size, fs_info->sectorsize);
2889
2890 mutex_lock(&fs_info->chunk_mutex);
2891 old_total = btrfs_super_total_bytes(super_copy);
2892 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2893
2894 if (new_size <= device->total_bytes ||
2895 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2896 mutex_unlock(&fs_info->chunk_mutex);
2897 return -EINVAL;
2898 }
2899
2900 btrfs_set_super_total_bytes(super_copy,
2901 round_down(old_total + diff, fs_info->sectorsize));
2902 device->fs_devices->total_rw_bytes += diff;
2903
2904 btrfs_device_set_total_bytes(device, new_size);
2905 btrfs_device_set_disk_total_bytes(device, new_size);
2906 btrfs_clear_space_info_full(device->fs_info);
2907 if (list_empty(&device->post_commit_list))
2908 list_add_tail(&device->post_commit_list,
2909 &trans->transaction->dev_update_list);
2910 mutex_unlock(&fs_info->chunk_mutex);
2911
2912 btrfs_reserve_chunk_metadata(trans, false);
2913 ret = btrfs_update_device(trans, device);
2914 btrfs_trans_release_chunk_metadata(trans);
2915
2916 return ret;
2917 }
2918
2919 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2920 {
2921 struct btrfs_fs_info *fs_info = trans->fs_info;
2922 struct btrfs_root *root = fs_info->chunk_root;
2923 int ret;
2924 struct btrfs_path *path;
2925 struct btrfs_key key;
2926
2927 path = btrfs_alloc_path();
2928 if (!path)
2929 return -ENOMEM;
2930
2931 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2932 key.offset = chunk_offset;
2933 key.type = BTRFS_CHUNK_ITEM_KEY;
2934
2935 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2936 if (ret < 0)
2937 goto out;
2938 else if (ret > 0) {
2939 btrfs_handle_fs_error(fs_info, -ENOENT,
2940 "Failed lookup while freeing chunk.");
2941 ret = -ENOENT;
2942 goto out;
2943 }
2944
2945 ret = btrfs_del_item(trans, root, path);
2946 if (ret < 0)
2947 btrfs_handle_fs_error(fs_info, ret,
2948 "Failed to delete chunk item.");
2949 out:
2950 btrfs_free_path(path);
2951 return ret;
2952 }
2953
2954 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2955 {
2956 struct btrfs_super_block *super_copy = fs_info->super_copy;
2957 struct btrfs_disk_key *disk_key;
2958 struct btrfs_chunk *chunk;
2959 u8 *ptr;
2960 int ret = 0;
2961 u32 num_stripes;
2962 u32 array_size;
2963 u32 len = 0;
2964 u32 cur;
2965 struct btrfs_key key;
2966
2967 lockdep_assert_held(&fs_info->chunk_mutex);
2968 array_size = btrfs_super_sys_array_size(super_copy);
2969
2970 ptr = super_copy->sys_chunk_array;
2971 cur = 0;
2972
2973 while (cur < array_size) {
2974 disk_key = (struct btrfs_disk_key *)ptr;
2975 btrfs_disk_key_to_cpu(&key, disk_key);
2976
2977 len = sizeof(*disk_key);
2978
2979 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2980 chunk = (struct btrfs_chunk *)(ptr + len);
2981 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2982 len += btrfs_chunk_item_size(num_stripes);
2983 } else {
2984 ret = -EIO;
2985 break;
2986 }
2987 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2988 key.offset == chunk_offset) {
2989 memmove(ptr, ptr + len, array_size - (cur + len));
2990 array_size -= len;
2991 btrfs_set_super_sys_array_size(super_copy, array_size);
2992 } else {
2993 ptr += len;
2994 cur += len;
2995 }
2996 }
2997 return ret;
2998 }
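
/*
 * Layout assumed by the walk above: sys_chunk_array is a packed sequence
 * of (struct btrfs_disk_key, struct btrfs_chunk + stripes) pairs, so each
 * entry occupies sizeof(*disk_key) + btrfs_chunk_item_size(num_stripes)
 * bytes. Deleting an entry is therefore a memmove() of the tail over the
 * entry followed by shrinking the recorded array size; anything that is
 * not a chunk item means the array is corrupted (-EIO).
 */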
2999
3000 /*
3001  * Find the mapping containing the given logical extent.
3002  *
3003  * @logical: Logical block offset in bytes.
3004  * @length:  Length of extent in bytes.
3005  * Return: Chunk mapping or ERR_PTR.
3006  */
3007 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3008 u64 logical, u64 length)
3009 {
3010 struct extent_map_tree *em_tree;
3011 struct extent_map *em;
3012
3013 em_tree = &fs_info->mapping_tree;
3014 read_lock(&em_tree->lock);
3015 em = lookup_extent_mapping(em_tree, logical, length);
3016 read_unlock(&em_tree->lock);
3017
3018 if (!em) {
3019 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
3020 logical, length);
3021 return ERR_PTR(-EINVAL);
3022 }
3023
3024 if (em->start > logical || em->start + em->len < logical) {
3025 btrfs_crit(fs_info,
3026 "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3027 logical, length, em->start, em->start + em->len);
3028 free_extent_map(em);
3029 return ERR_PTR(-EINVAL);
3030 }
3031
3032 /* Callers are responsible for dropping em's ref. */
3033 return em;
3034 }
3035
3036 static int remove_chunk_item(struct btrfs_trans_handle *trans,
3037 struct map_lookup *map, u64 chunk_offset)
3038 {
3039 int i;
3040
3041 /*
3042  * Removing chunk items and updating the device items in the chunks
3043  * btree requires holding the chunk_mutex. See the comment at
3044  * btrfs_chunk_alloc() for the details.
3045  */
3046 lockdep_assert_held(&trans->fs_info->chunk_mutex);
3047
3048 for (i = 0; i < map->num_stripes; i++) {
3049 int ret;
3050
3051 ret = btrfs_update_device(trans, map->stripes[i].dev);
3052 if (ret)
3053 return ret;
3054 }
3055
3056 return btrfs_free_chunk(trans, chunk_offset);
3057 }
3058
3059 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3060 {
3061 struct btrfs_fs_info *fs_info = trans->fs_info;
3062 struct extent_map *em;
3063 struct map_lookup *map;
3064 u64 dev_extent_len = 0;
3065 int i, ret = 0;
3066 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3067
3068 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3069 if (IS_ERR(em)) {
3070 /*
3071  * This is a logic error, but we don't want to just rely on the
3072  * user having built with ASSERT enabled, so if ASSERT doesn't
3073  * do anything we still error out.
3074  */
3075 ASSERT(0);
3076 return PTR_ERR(em);
3077 }
3078 map = em->map_lookup;
3079
3080 /*
3081  * First delete the device extent items from the devices btree.
3082  * We take the device_list_mutex to avoid racing with the finishing
3083  * phase of a device replace operation - see the longer comment below,
3084  * before fs_info->chunk_mutex is taken. A replace that finishes while
3085  * we run swaps the device object backing the map's stripes, so without
3086  * this mutex we could update the device extent items of a device that
3087  * is going away, or miss the replacement device entirely and leave its
3088  * extent items behind.
3089  */
3090 mutex_lock(&fs_devices->device_list_mutex);
3091 for (i = 0; i < map->num_stripes; i++) {
3092 struct btrfs_device *device = map->stripes[i].dev;
3093 ret = btrfs_free_dev_extent(trans, device,
3094 map->stripes[i].physical,
3095 &dev_extent_len);
3096 if (ret) {
3097 mutex_unlock(&fs_devices->device_list_mutex);
3098 btrfs_abort_transaction(trans, ret);
3099 goto out;
3100 }
3101
3102 if (device->bytes_used > 0) {
3103 mutex_lock(&fs_info->chunk_mutex);
3104 btrfs_device_set_bytes_used(device,
3105 device->bytes_used - dev_extent_len);
3106 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3107 btrfs_clear_space_info_full(fs_info);
3108 mutex_unlock(&fs_info->chunk_mutex);
3109 }
3110 }
3111 mutex_unlock(&fs_devices->device_list_mutex);
3112
3113 /*
3114  * We acquire fs_info->chunk_mutex for 2 reasons:
3115  *
3116  * 1) Just like with the first phase of the chunk allocation, we must
3117  *    reserve system space, do all chunk btree updates and deletions,
3118  *    and update the system chunk array in the superblock while holding
3119  *    this mutex. This is for similar reasons as explained in the
3120  *    comment at the top of btrfs_chunk_alloc().
3121  *
3122  * 2) Prevent races with the final phase of a device replace operation
3123  *    that replaces the device object associated with the map's stripes,
3124  *    because the device object's id can change at any time during that
3125  *    final phase of the device replace operation
3126  *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab
3127  *    the replaced device and then see it with an ID of
3128  *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3129  *    the device item in the chunk btree or when deleting the device
3130  *    extent items from the devices btree - or we could grab the
3131  *    replacement device, which is being removed from the devices btree
3132  *    (its extent items), and fail when updating the device item.
3133  */
3134 trans->removing_chunk = true;
3135 mutex_lock(&fs_info->chunk_mutex);
3136
3137 check_system_chunk(trans, map->type);
3138
3139 ret = remove_chunk_item(trans, map, chunk_offset);
3140
3141 /*
3142  * Normally we should not get -ENOSPC since we reserved space before
3143  * through the call to check_system_chunk().
3144  *
3145  * Despite our system space_info having enough free space, we may not
3146  * be able to allocate extents from its block groups, because all have
3147  * an incompatible profile, which forces us to allocate a new system
3148  * block group with the right profile, or, right after we called
3149  * check_system_chunk() above, a scrub turned the only system block
3150  * group with enough free space into RO mode. This is explained in
3151  * more detail in the comment at btrfs_chunk_alloc(). So if we get
3152  * -ENOSPC, allocate a new system chunk and retry once.
3153  */
3154 if (ret == -ENOSPC) {
3155 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3156 struct btrfs_block_group *sys_bg;
3157
3158 sys_bg = btrfs_create_chunk(trans, sys_flags);
3159 if (IS_ERR(sys_bg)) {
3160 ret = PTR_ERR(sys_bg);
3161 btrfs_abort_transaction(trans, ret);
3162 goto out;
3163 }
3164
3165 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3166 if (ret) {
3167 btrfs_abort_transaction(trans, ret);
3168 goto out;
3169 }
3170
3171 ret = remove_chunk_item(trans, map, chunk_offset);
3172 if (ret) {
3173 btrfs_abort_transaction(trans, ret);
3174 goto out;
3175 }
3176 } else if (ret) {
3177 btrfs_abort_transaction(trans, ret);
3178 goto out;
3179 }
3180
3181 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3182
3183 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3184 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3185 if (ret) {
3186 btrfs_abort_transaction(trans, ret);
3187 goto out;
3188 }
3189 }
3190
3191 mutex_unlock(&fs_info->chunk_mutex);
3192 trans->removing_chunk = false;
3193
3194 /*
3195  * We are done with chunk btree updates and deletions, so release the
3196  * system space we previously reserved (with check_system_chunk()).
3197  */
3198 btrfs_trans_release_chunk_metadata(trans);
3199
3200 ret = btrfs_remove_block_group(trans, chunk_offset, em);
3201 if (ret) {
3202 btrfs_abort_transaction(trans, ret);
3203 goto out;
3204 }
3205
3206 out:
3207 if (trans->removing_chunk) {
3208 mutex_unlock(&fs_info->chunk_mutex);
3209 trans->removing_chunk = false;
3210 }
3211
3212 free_extent_map(em);
3213 return ret;
3214 }
3215
3216 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3217 {
3218 struct btrfs_root *root = fs_info->chunk_root;
3219 struct btrfs_trans_handle *trans;
3220 struct btrfs_block_group *block_group;
3221 u64 length;
3222 int ret;
3223
3224 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3225 btrfs_err(fs_info,
3226 "relocate: not supported on extent tree v2 yet");
3227 return -EINVAL;
3228 }
3229
3230 /*
3231  * Prevent races with automatic removal of unused block groups.
3232  * After we relocate and before we remove the chunk with offset
3233  * chunk_offset, automatic removal of the block group can kick in,
3234  * resulting in a failure when calling btrfs_remove_chunk() below.
3235  *
3236  * Make sure to acquire this mutex before doing a tree search (dev
3237  * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3238  * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3239  * we release the path used to search the chunk/dev tree and before
3240  * we acquire the lock below (race with relocation).
3241  */
3242 lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3243
3244 /* step one, relocate all the extents inside this chunk */
3245 btrfs_scrub_pause(fs_info);
3246 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3247 btrfs_scrub_continue(fs_info);
3248 if (ret)
3249 return ret;
3250
3251 block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3252 if (!block_group)
3253 return -ENOENT;
3254 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3255 length = block_group->length;
3256 btrfs_put_block_group(block_group);
3257
3258 /*
3259  * On a zoned file system, discard the whole block group; this will
3260  * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3261  * resetting the zone fails, don't treat it as a fatal problem from
3262  * the filesystem's point of view.
3263  */
3264 if (btrfs_is_zoned(fs_info)) {
3265 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3266 if (ret)
3267 btrfs_info(fs_info,
3268 "failed to reset zone %llu after relocation",
3269 chunk_offset);
3270 }
3271
3272 trans = btrfs_start_trans_remove_block_group(root->fs_info,
3273 chunk_offset);
3274 if (IS_ERR(trans)) {
3275 ret = PTR_ERR(trans);
3276 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3277 return ret;
3278 }
3279
3280 /*
3281  * step two, delete the device extents and the
3282  * chunk tree entries
3283  */
3284 ret = btrfs_remove_chunk(trans, chunk_offset);
3285 btrfs_end_transaction(trans);
3286 return ret;
3287 }
3288
3289 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3290 {
3291 struct btrfs_root *chunk_root = fs_info->chunk_root;
3292 struct btrfs_path *path;
3293 struct extent_buffer *leaf;
3294 struct btrfs_chunk *chunk;
3295 struct btrfs_key key;
3296 struct btrfs_key found_key;
3297 u64 chunk_type;
3298 bool retried = false;
3299 int failed = 0;
3300 int ret;
3301
3302 path = btrfs_alloc_path();
3303 if (!path)
3304 return -ENOMEM;
3305
3306 again:
3307 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3308 key.offset = (u64)-1;
3309 key.type = BTRFS_CHUNK_ITEM_KEY;
3310
3311 while (1) {
3312 mutex_lock(&fs_info->reclaim_bgs_lock);
3313 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3314 if (ret < 0) {
3315 mutex_unlock(&fs_info->reclaim_bgs_lock);
3316 goto error;
3317 }
3318 BUG_ON(ret == 0);
3319
3320 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3321 key.type);
3322 if (ret)
3323 mutex_unlock(&fs_info->reclaim_bgs_lock);
3324 if (ret < 0)
3325 goto error;
3326 if (ret > 0)
3327 break;
3328
3329 leaf = path->nodes[0];
3330 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3331
3332 chunk = btrfs_item_ptr(leaf, path->slots[0],
3333 struct btrfs_chunk);
3334 chunk_type = btrfs_chunk_type(leaf, chunk);
3335 btrfs_release_path(path);
3336
3337 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3338 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3339 if (ret == -ENOSPC)
3340 failed++;
3341 else
3342 BUG_ON(ret);
3343 }
3344 mutex_unlock(&fs_info->reclaim_bgs_lock);
3345
3346 if (found_key.offset == 0)
3347 break;
3348 key.offset = found_key.offset - 1;
3349 }
3350 ret = 0;
3351 if (failed && !retried) {
3352 failed = 0;
3353 retried = true;
3354 goto again;
3355 } else if (WARN_ON(failed && retried)) {
3356 ret = -ENOSPC;
3357 }
3358 error:
3359 btrfs_free_path(path);
3360 return ret;
3361 }
3362
3363 /*
3364  * Return 1  : allocated a new data chunk,
3365  * return <0 : error during allocation,
3366  * return 0  : no need to allocate a data chunk.
3367  */
3368 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3369 u64 chunk_offset)
3370 {
3371 struct btrfs_block_group *cache;
3372 u64 bytes_used;
3373 u64 chunk_type;
3374
3375 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3376 ASSERT(cache);
3377 chunk_type = cache->flags;
3378 btrfs_put_block_group(cache);
3379
3380 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3381 return 0;
3382
3383 spin_lock(&fs_info->data_sinfo->lock);
3384 bytes_used = fs_info->data_sinfo->bytes_used;
3385 spin_unlock(&fs_info->data_sinfo->lock);
3386
3387 if (!bytes_used) {
3388 struct btrfs_trans_handle *trans;
3389 int ret;
3390
3391 trans = btrfs_join_transaction(fs_info->tree_root);
3392 if (IS_ERR(trans))
3393 return PTR_ERR(trans);
3394
3395 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3396 btrfs_end_transaction(trans);
3397 if (ret < 0)
3398 return ret;
3399 return 1;
3400 }
3401
3402 return 0;
3403 }
3404
3405 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3406 struct btrfs_balance_control *bctl)
3407 {
3408 struct btrfs_root *root = fs_info->tree_root;
3409 struct btrfs_trans_handle *trans;
3410 struct btrfs_balance_item *item;
3411 struct btrfs_disk_balance_args disk_bargs;
3412 struct btrfs_path *path;
3413 struct extent_buffer *leaf;
3414 struct btrfs_key key;
3415 int ret, err;
3416
3417 path = btrfs_alloc_path();
3418 if (!path)
3419 return -ENOMEM;
3420
3421 trans = btrfs_start_transaction(root, 0);
3422 if (IS_ERR(trans)) {
3423 btrfs_free_path(path);
3424 return PTR_ERR(trans);
3425 }
3426
3427 key.objectid = BTRFS_BALANCE_OBJECTID;
3428 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3429 key.offset = 0;
3430
3431 ret = btrfs_insert_empty_item(trans, root, path, &key,
3432 sizeof(*item));
3433 if (ret)
3434 goto out;
3435
3436 leaf = path->nodes[0];
3437 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3438
3439 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3440
3441 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3442 btrfs_set_balance_data(leaf, item, &disk_bargs);
3443 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3444 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3445 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3446 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3447
3448 btrfs_set_balance_flags(leaf, item, bctl->flags);
3449
3450 btrfs_mark_buffer_dirty(leaf);
3451 out:
3452 btrfs_free_path(path);
3453 err = btrfs_commit_transaction(trans);
3454 if (err && !ret)
3455 ret = err;
3456 return ret;
3457 }
3458
3459 static int del_balance_item(struct btrfs_fs_info *fs_info)
3460 {
3461 struct btrfs_root *root = fs_info->tree_root;
3462 struct btrfs_trans_handle *trans;
3463 struct btrfs_path *path;
3464 struct btrfs_key key;
3465 int ret, err;
3466
3467 path = btrfs_alloc_path();
3468 if (!path)
3469 return -ENOMEM;
3470
3471 trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3472 if (IS_ERR(trans)) {
3473 btrfs_free_path(path);
3474 return PTR_ERR(trans);
3475 }
3476
3477 key.objectid = BTRFS_BALANCE_OBJECTID;
3478 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3479 key.offset = 0;
3480
3481 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3482 if (ret < 0)
3483 goto out;
3484 if (ret > 0) {
3485 ret = -ENOENT;
3486 goto out;
3487 }
3488
3489 ret = btrfs_del_item(trans, root, path);
3490 out:
3491 btrfs_free_path(path);
3492 err = btrfs_commit_transaction(trans);
3493 if (err && !ret)
3494 ret = err;
3495 return ret;
3496 }
3497
3498 /*
3499  * This is a heuristic used to reduce the number of chunks balanced on
3500  * resume after balance was interrupted.
3501  */
3502 static void update_balance_args(struct btrfs_balance_control *bctl)
3503 {
3504 /*
3505  * Turn on soft mode for chunk types that were being converted.
3506  */
3507 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3508 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3509 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3510 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3511 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3512 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3513
3514 /*
3515  * Turn on usage filter if it is not already used. The idea is
3516  * that chunks that we have already balanced should be
3517  * reasonably full. Don't do it for chunks that are being
3518  * converted - that will keep us from relocating unconverted
3519  * (albeit full) chunks.
3520  */
3521 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3522 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3523 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3524 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3525 bctl->data.usage = 90;
3526 }
3527 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3528 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3529 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3530 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3531 bctl->sys.usage = 90;
3532 }
3533 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3534 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3535 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3536 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3537 bctl->meta.usage = 90;
3538 }
3539 }
3540
3541 /*
3542  * Clear the balance status in fs_info and delete the balance item from disk.
3543  */
3544 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3545 {
3546 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3547 int ret;
3548
3549 BUG_ON(!fs_info->balance_ctl);
3550
3551 spin_lock(&fs_info->balance_lock);
3552 fs_info->balance_ctl = NULL;
3553 spin_unlock(&fs_info->balance_lock);
3554
3555 kfree(bctl);
3556 ret = del_balance_item(fs_info);
3557 if (ret)
3558 btrfs_handle_fs_error(fs_info, ret, NULL);
3559 }
3560
3561 /*
3562  * Balance filters. Return 1 if chunk should be filtered out
3563  * (should not be balanced).
3564  */
3565 static int chunk_profiles_filter(u64 chunk_type,
3566 struct btrfs_balance_args *bargs)
3567 {
3568 chunk_type = chunk_to_extended(chunk_type) &
3569 BTRFS_EXTENDED_PROFILE_MASK;
3570
3571 if (bargs->profiles & chunk_type)
3572 return 0;
3573
3574 return 1;
3575 }
3576
3577 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3578 struct btrfs_balance_args *bargs)
3579 {
3580 struct btrfs_block_group *cache;
3581 u64 chunk_used;
3582 u64 user_thresh_min;
3583 u64 user_thresh_max;
3584 int ret = 1;
3585
3586 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3587 chunk_used = cache->used;
3588
3589 if (bargs->usage_min == 0)
3590 user_thresh_min = 0;
3591 else
3592 user_thresh_min = div_factor_fine(cache->length,
3593 bargs->usage_min);
3594
3595 if (bargs->usage_max == 0)
3596 user_thresh_max = 1;
3597 else if (bargs->usage_max > 100)
3598 user_thresh_max = cache->length;
3599 else
3600 user_thresh_max = div_factor_fine(cache->length,
3601 bargs->usage_max);
3602
3603 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3604 ret = 0;
3605
3606 btrfs_put_block_group(cache);
3607 return ret;
3608 }
3609
3610 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3611 u64 chunk_offset, struct btrfs_balance_args *bargs)
3612 {
3613 struct btrfs_block_group *cache;
3614 u64 chunk_used, user_thresh;
3615 int ret = 1;
3616
3617 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3618 chunk_used = cache->used;
3619
3620 if (bargs->usage_min == 0)
3621 user_thresh = 1;
3622 else if (bargs->usage > 100)
3623 user_thresh = cache->length;
3624 else
3625 user_thresh = div_factor_fine(cache->length, bargs->usage);
3626
3627 if (chunk_used < user_thresh)
3628 ret = 0;
3629
3630 btrfs_put_block_group(cache);
3631 return ret;
3632 }
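
/*
 * Both usage filters compare the chunk's used bytes against thresholds
 * computed as a percentage of the block group length; assuming the usual
 * div_factor_fine() semantics (num * factor / 100), a 1 GiB chunk with
 * usage=30 yields a threshold of ~307 MiB, so a chunk using less than
 * that is kept for balancing (the filter returns 0).
 */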
3633
3634 static int chunk_devid_filter(struct extent_buffer *leaf,
3635 struct btrfs_chunk *chunk,
3636 struct btrfs_balance_args *bargs)
3637 {
3638 struct btrfs_stripe *stripe;
3639 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3640 int i;
3641
3642 for (i = 0; i < num_stripes; i++) {
3643 stripe = btrfs_stripe_nr(chunk, i);
3644 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3645 return 0;
3646 }
3647
3648 return 1;
3649 }
3650
3651 static u64 calc_data_stripes(u64 type, int num_stripes)
3652 {
3653 const int index = btrfs_bg_flags_to_raid_index(type);
3654 const int ncopies = btrfs_raid_array[index].ncopies;
3655 const int nparity = btrfs_raid_array[index].nparity;
3656
3657 return (num_stripes - nparity) / ncopies;
3658 }
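
/*
 * Worked examples for calc_data_stripes(), using the ncopies/nparity
 * pairs from btrfs_raid_array (RAID10: 2/0, RAID5: 1/1, RAID6: 1/2):
 * a 4-stripe RAID10 chunk has (4 - 0) / 2 = 2 data stripes, a 3-stripe
 * RAID5 chunk has (3 - 1) / 1 = 2, and a 4-stripe RAID6 chunk has
 * (4 - 2) / 1 = 2. chunk_drange_filter() below uses this as the divisor
 * to turn a chunk length into a per-device stripe length.
 */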
3659
3660 /* [pstart, pend) */
3661 static int chunk_drange_filter(struct extent_buffer *leaf,
3662 struct btrfs_chunk *chunk,
3663 struct btrfs_balance_args *bargs)
3664 {
3665 struct btrfs_stripe *stripe;
3666 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3667 u64 stripe_offset;
3668 u64 stripe_length;
3669 u64 type;
3670 int factor;
3671 int i;
3672
3673 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3674 return 0;
3675
3676 type = btrfs_chunk_type(leaf, chunk);
3677 factor = calc_data_stripes(type, num_stripes);
3678
3679 for (i = 0; i < num_stripes; i++) {
3680 stripe = btrfs_stripe_nr(chunk, i);
3681 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3682 continue;
3683
3684 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3685 stripe_length = btrfs_chunk_length(leaf, chunk);
3686 stripe_length = div_u64(stripe_length, factor);
3687
3688 if (stripe_offset < bargs->pend &&
3689 stripe_offset + stripe_length > bargs->pstart)
3690 return 0;
3691 }
3692
3693 return 1;
3694 }
3695
3696 /* [vstart, vend) */
3697 static int chunk_vrange_filter(struct extent_buffer *leaf,
3698 struct btrfs_chunk *chunk,
3699 u64 chunk_offset,
3700 struct btrfs_balance_args *bargs)
3701 {
3702 if (chunk_offset < bargs->vend &&
3703 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3704 /* at least part of the chunk is inside this vrange */
3705 return 0;
3706
3707 return 1;
3708 }
3709
3710 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3711 struct btrfs_chunk *chunk,
3712 struct btrfs_balance_args *bargs)
3713 {
3714 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3715
3716 if (bargs->stripes_min <= num_stripes
3717 && num_stripes <= bargs->stripes_max)
3718 return 0;
3719
3720 return 1;
3721 }
3722
3723 static int chunk_soft_convert_filter(u64 chunk_type,
3724 struct btrfs_balance_args *bargs)
3725 {
3726 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3727 return 0;
3728
3729 chunk_type = chunk_to_extended(chunk_type) &
3730 BTRFS_EXTENDED_PROFILE_MASK;
3731
3732 if (bargs->target == chunk_type)
3733 return 1;
3734
3735 return 0;
3736 }
3737
3738 static int should_balance_chunk(struct extent_buffer *leaf,
3739 struct btrfs_chunk *chunk, u64 chunk_offset)
3740 {
3741 struct btrfs_fs_info *fs_info = leaf->fs_info;
3742 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3743 struct btrfs_balance_args *bargs = NULL;
3744 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3745
3746 /* type filter */
3747 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3748 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3749 return 0;
3750 }
3751
3752 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3753 bargs = &bctl->data;
3754 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3755 bargs = &bctl->sys;
3756 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3757 bargs = &bctl->meta;
3758
3759 /* profiles filter */
3760 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3761 chunk_profiles_filter(chunk_type, bargs)) {
3762 return 0;
3763 }
3764
3765 /* usage filter */
3766 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3767 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3768 return 0;
3769 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3770 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3771 return 0;
3772 }
3773
3774 /* devid filter */
3775 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3776 chunk_devid_filter(leaf, chunk, bargs)) {
3777 return 0;
3778 }
3779
3780 /* drange filter, makes sense only with devid filter */
3781 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3782 chunk_drange_filter(leaf, chunk, bargs)) {
3783 return 0;
3784 }
3785
3786 /* vrange filter */
3787 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3788 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3789 return 0;
3790 }
3791
3792 /* stripes filter */
3793 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3794 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3795 return 0;
3796 }
3797
3798 /* soft profile changing mode filter */
3799 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3800 chunk_soft_convert_filter(chunk_type, bargs)) {
3801 return 0;
3802 }
3803
3804 /*
3805  * limited by count, must be the last filter
3806  */
3807 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3808 if (bargs->limit == 0)
3809 return 0;
3810 else
3811 bargs->limit--;
3812 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3813 /*
3814  * Same logic as the 'limit' filter; the minimum cannot be
3815  * determined here because we do not have the global information
3816  * about the count of all chunks that satisfy the filters.
3817  */
3818 if (bargs->limit_max == 0)
3819 return 0;
3820 else
3821 bargs->limit_max--;
3822 }
3823
3824 return 1;
3825 }
3826
3827 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3828 {
3829 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3830 struct btrfs_root *chunk_root = fs_info->chunk_root;
3831 u64 chunk_type;
3832 struct btrfs_chunk *chunk;
3833 struct btrfs_path *path = NULL;
3834 struct btrfs_key key;
3835 struct btrfs_key found_key;
3836 struct extent_buffer *leaf;
3837 int slot;
3838 int ret;
3839 int enospc_errors = 0;
3840 bool counting = true;
3841
3842 u64 limit_data = bctl->data.limit;
3843 u64 limit_meta = bctl->meta.limit;
3844 u64 limit_sys = bctl->sys.limit;
3845 u32 count_data = 0;
3846 u32 count_meta = 0;
3847 u32 count_sys = 0;
3848 int chunk_reserved = 0;
3849
3850 path = btrfs_alloc_path();
3851 if (!path) {
3852 ret = -ENOMEM;
3853 goto error;
3854 }
3855
3856 /* zero out stat counters */
3857 spin_lock(&fs_info->balance_lock);
3858 memset(&bctl->stat, 0, sizeof(bctl->stat));
3859 spin_unlock(&fs_info->balance_lock);
3860 again:
3861 if (!counting) {
3862 /*
3863  * Restore the per-type chunk count limits, which were decremented
3864  * by should_balance_chunk() during the counting pass above.
3865  */
3866 bctl->data.limit = limit_data;
3867 bctl->meta.limit = limit_meta;
3868 bctl->sys.limit = limit_sys;
3869 }
3870 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3871 key.offset = (u64)-1;
3872 key.type = BTRFS_CHUNK_ITEM_KEY;
3873
3874 while (1) {
3875 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3876 atomic_read(&fs_info->balance_cancel_req)) {
3877 ret = -ECANCELED;
3878 goto error;
3879 }
3880
3881 mutex_lock(&fs_info->reclaim_bgs_lock);
3882 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3883 if (ret < 0) {
3884 mutex_unlock(&fs_info->reclaim_bgs_lock);
3885 goto error;
3886 }
3887
3888 /*
3889  * An exact match (ret == 0) is impossible here: the search key uses
3890  * offset (u64)-1, which can never exist in the chunk tree.
3891  */
3892 if (ret == 0)
3893 BUG();
3894
3895 ret = btrfs_previous_item(chunk_root, path, 0,
3896 BTRFS_CHUNK_ITEM_KEY);
3897 if (ret) {
3898 mutex_unlock(&fs_info->reclaim_bgs_lock);
3899 ret = 0;
3900 break;
3901 }
3902
3903 leaf = path->nodes[0];
3904 slot = path->slots[0];
3905 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3906
3907 if (found_key.objectid != key.objectid) {
3908 mutex_unlock(&fs_info->reclaim_bgs_lock);
3909 break;
3910 }
3911
3912 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3913 chunk_type = btrfs_chunk_type(leaf, chunk);
3914
3915 if (!counting) {
3916 spin_lock(&fs_info->balance_lock);
3917 bctl->stat.considered++;
3918 spin_unlock(&fs_info->balance_lock);
3919 }
3920
3921 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3922
3923 btrfs_release_path(path);
3924 if (!ret) {
3925 mutex_unlock(&fs_info->reclaim_bgs_lock);
3926 goto loop;
3927 }
3928
3929 if (counting) {
3930 mutex_unlock(&fs_info->reclaim_bgs_lock);
3931 spin_lock(&fs_info->balance_lock);
3932 bctl->stat.expected++;
3933 spin_unlock(&fs_info->balance_lock);
3934
3935 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3936 count_data++;
3937 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3938 count_sys++;
3939 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3940 count_meta++;
3941
3942 goto loop;
3943 }
3944
3945 /*
3946  * Apply limit_min filter, no need to check if the LIMITS
3947  * filter is used, limit_min is 0 by default.
3948  */
3949 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3950 count_data < bctl->data.limit_min)
3951 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3952 count_meta < bctl->meta.limit_min)
3953 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3954 count_sys < bctl->sys.limit_min)) {
3955 mutex_unlock(&fs_info->reclaim_bgs_lock);
3956 goto loop;
3957 }
3958
3959 if (!chunk_reserved) {
3960 /*
3961  * We may be relocating the only data chunk we have,
3962  * which could potentially end up with losing data's
3963  * raid profile, so lets allocate an empty one in
3964  * advance.
3965  */
3966 ret = btrfs_may_alloc_data_chunk(fs_info,
3967 found_key.offset);
3968 if (ret < 0) {
3969 mutex_unlock(&fs_info->reclaim_bgs_lock);
3970 goto error;
3971 } else if (ret == 1) {
3972 chunk_reserved = 1;
3973 }
3974 }
3975
3976 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3977 mutex_unlock(&fs_info->reclaim_bgs_lock);
3978 if (ret == -ENOSPC) {
3979 enospc_errors++;
3980 } else if (ret == -ETXTBSY) {
3981 btrfs_info(fs_info,
3982 "skipping relocation of block group %llu due to active swapfile",
3983 found_key.offset);
3984 ret = 0;
3985 } else if (ret) {
3986 goto error;
3987 } else {
3988 spin_lock(&fs_info->balance_lock);
3989 bctl->stat.completed++;
3990 spin_unlock(&fs_info->balance_lock);
3991 }
3992 loop:
3993 if (found_key.offset == 0)
3994 break;
3995 key.offset = found_key.offset - 1;
3996 }
3997
3998 if (counting) {
3999 btrfs_release_path(path);
4000 counting = false;
4001 goto again;
4002 }
4003 error:
4004 btrfs_free_path(path);
4005 if (enospc_errors) {
4006 btrfs_info(fs_info, "%d enospc errors during balance",
4007 enospc_errors);
4008 if (!ret)
4009 ret = -ENOSPC;
4010 }
4011
4012 return ret;
4013 }
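
/*
 * Note the two-pass structure above: the first pass (counting == true)
 * only tallies chunks that pass the filters into bctl->stat.expected,
 * then the walk restarts with counting == false to do the actual
 * relocation. That is why the limit counters are restored between the
 * passes and why the limit_min checks can trust the per-type counts
 * gathered in the first pass.
 */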
4014
4015 /*
4016  * alloc_profile_is_valid - see if a given profile is valid and reduced
4017  * @flags:    profile to validate
4018  * @extended: if true @flags is treated as an extended profile
4019  */
4020 static int alloc_profile_is_valid(u64 flags, int extended)
4021 {
4022 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4023 BTRFS_BLOCK_GROUP_PROFILE_MASK);
4024
4025 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4026
4027 /* 1) check that all other bits are zeroed */
4028 if (flags & ~mask)
4029 return 0;
4030
4031 /* 2) see if profile is reduced */
4032 if (flags == 0)
4033 return !extended;
4034
4035 return has_single_bit_set(flags);
4036 }
4037
4038 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
4039 {
4040 /* cancel requested || normal exit path */
4041 return atomic_read(&fs_info->balance_cancel_req) ||
4042 (atomic_read(&fs_info->balance_pause_req) == 0 &&
4043 atomic_read(&fs_info->balance_cancel_req) == 0);
4044 }
4045
4046 /*
4047  * Validate target profile against allowed profiles and return true if
4048  * it's OK. Otherwise print the error message and return false.
4049  */
4050 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4051 const struct btrfs_balance_args *bargs,
4052 u64 allowed, const char *type)
4053 {
4054 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4055 return true;
4056
4057 /* Profile is valid and does not have bits outside of the allowed set */
4058 if (alloc_profile_is_valid(bargs->target, 1) &&
4059 (bargs->target & ~allowed) == 0)
4060 return true;
4061
4062 btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4063 type, btrfs_bg_type_to_raid_name(bargs->target));
4064 return false;
4065 }
4066
4067 /*
4068  * Fill @buf with textual description of balance filter flags @bargs, up
4069  * to @size_buf including the terminating null. The output may be
4070  * trimmed if it does not fit.
4071  */
4072 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4073 u32 size_buf)
4074 {
4075 int ret;
4076 u32 size_bp = size_buf;
4077 char *bp = buf;
4078 u64 flags = bargs->flags;
4079 char tmp_buf[128] = {'\0'};
4080
4081 if (!flags)
4082 return;
4083
4084 #define CHECK_APPEND_NOARG(a) \
4085 do { \
4086 ret = snprintf(bp, size_bp, (a)); \
4087 if (ret < 0 || ret >= size_bp) \
4088 goto out_overflow; \
4089 size_bp -= ret; \
4090 bp += ret; \
4091 } while (0)
4092
4093 #define CHECK_APPEND_1ARG(a, v1) \
4094 do { \
4095 ret = snprintf(bp, size_bp, (a), (v1)); \
4096 if (ret < 0 || ret >= size_bp) \
4097 goto out_overflow; \
4098 size_bp -= ret; \
4099 bp += ret; \
4100 } while (0)
4101
4102 #define CHECK_APPEND_2ARG(a, v1, v2) \
4103 do { \
4104 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
4105 if (ret < 0 || ret >= size_bp) \
4106 goto out_overflow; \
4107 size_bp -= ret; \
4108 bp += ret; \
4109 } while (0)
4110
4111 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4112 CHECK_APPEND_1ARG("convert=%s,",
4113 btrfs_bg_type_to_raid_name(bargs->target));
4114
4115 if (flags & BTRFS_BALANCE_ARGS_SOFT)
4116 CHECK_APPEND_NOARG("soft,");
4117
4118 if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4119 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4120 sizeof(tmp_buf));
4121 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4122 }
4123
4124 if (flags & BTRFS_BALANCE_ARGS_USAGE)
4125 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4126
4127 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4128 CHECK_APPEND_2ARG("usage=%u..%u,",
4129 bargs->usage_min, bargs->usage_max);
4130
4131 if (flags & BTRFS_BALANCE_ARGS_DEVID)
4132 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4133
4134 if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4135 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4136 bargs->pstart, bargs->pend);
4137
4138 if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4139 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4140 bargs->vstart, bargs->vend);
4141
4142 if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4143 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4144
4145 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4146 CHECK_APPEND_2ARG("limit=%u..%u,",
4147 bargs->limit_min, bargs->limit_max);
4148
4149 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4150 CHECK_APPEND_2ARG("stripes=%u..%u,",
4151 bargs->stripes_min, bargs->stripes_max);
4152
4153 #undef CHECK_APPEND_2ARG
4154 #undef CHECK_APPEND_1ARG
4155 #undef CHECK_APPEND_NOARG
4156
4157 out_overflow:
4158 /* remove last , */
4159 if (size_bp < size_buf)
4160 buf[size_buf - size_bp - 1] = '\0';
4161 else
4162 buf[0] = '\0';
4163 }
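
/*
 * Example of a string produced above for a convert balance with a usage
 * filter: "convert=raid1,soft,usage=50" (the out_overflow path strips
 * the trailing comma). The buffer can never overflow: every
 * CHECK_APPEND_*() bails out as soon as snprintf() reports truncation.
 */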
4164
4165 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4166 {
4167 u32 size_buf = 1024;
4168 char tmp_buf[192] = {'\0'};
4169 char *buf;
4170 char *bp;
4171 u32 size_bp = size_buf;
4172 int ret;
4173 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4174
4175 buf = kzalloc(size_buf, GFP_KERNEL);
4176 if (!buf)
4177 return;
4178
4179 bp = buf;
4180
4181 #define CHECK_APPEND_1ARG(a, v1) \
4182 do { \
4183 ret = snprintf(bp, size_bp, (a), (v1)); \
4184 if (ret < 0 || ret >= size_bp) \
4185 goto out_overflow; \
4186 size_bp -= ret; \
4187 bp += ret; \
4188 } while (0)
4189
4190 if (bctl->flags & BTRFS_BALANCE_FORCE)
4191 CHECK_APPEND_1ARG("%s", "-f ");
4192
4193 if (bctl->flags & BTRFS_BALANCE_DATA) {
4194 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4195 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4196 }
4197
4198 if (bctl->flags & BTRFS_BALANCE_METADATA) {
4199 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4200 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4201 }
4202
4203 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4204 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4205 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4206 }
4207
4208 #undef CHECK_APPEND_1ARG
4209
4210 out_overflow:
4211 /* remove last " " */
4212 if (size_bp < size_buf)
4213 buf[size_buf - size_bp - 1] = '\0';
4214 btrfs_info(fs_info, "balance: %s %s",
4215 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4216 "resume" : "start", buf);
4217
4218 kfree(buf);
4219 }
4220
4221 /*
4222  * Should be called with balance mutex held.
4223  */
4224 int btrfs_balance(struct btrfs_fs_info *fs_info,
4225 struct btrfs_balance_control *bctl,
4226 struct btrfs_ioctl_balance_args *bargs)
4227 {
4228 u64 meta_target, data_target;
4229 u64 allowed;
4230 int mixed = 0;
4231 int ret;
4232 u64 num_devices;
4233 unsigned seq;
4234 bool reducing_redundancy;
4235 int i;
4236
4237 if (btrfs_fs_closing(fs_info) ||
4238 atomic_read(&fs_info->balance_pause_req) ||
4239 btrfs_should_cancel_balance(fs_info)) {
4240 ret = -EINVAL;
4241 goto out;
4242 }
4243
4244 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4245 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4246 mixed = 1;
4247
4248 /*
4249  * In case of mixed groups both data and meta should be picked,
4250  * and identical options should be given for both of them.
4251  */
4252 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4253 if (mixed && (bctl->flags & allowed)) {
4254 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4255 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4256 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4257 btrfs_err(fs_info,
4258 "balance: mixed groups data and metadata options must be the same");
4259 ret = -EINVAL;
4260 goto out;
4261 }
4262 }
4263
4264 /*
4265  * rw_devices will not change at the moment, device add/delete/replace
4266  * are excluded by the exclusive operation lock.
4267  */
4268 num_devices = fs_info->fs_devices->rw_devices;
4269
4270 /*
4271  * SINGLE profile on-disk has no profile bit, but in-memory we have a
4272  * special bit for it, to make it easier to distinguish. Thus we need
4273  * to set it manually, or balance would refuse the profile.
4274  */
4275 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4276 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4277 if (num_devices >= btrfs_raid_array[i].devs_min)
4278 allowed |= btrfs_raid_array[i].bg_flag;
4279
4280 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4281 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4282 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
4283 ret = -EINVAL;
4284 goto out;
4285 }
4286
4287 /*
4288  * Allow to reduce metadata or system integrity only if force set for
4289  * profiles with redundancy (copies, parity).
4290  */
4291 allowed = 0;
4292 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4293 if (btrfs_raid_array[i].ncopies >= 2 ||
4294 btrfs_raid_array[i].tolerated_failures >= 1)
4295 allowed |= btrfs_raid_array[i].bg_flag;
4296 }
4297 do {
4298 seq = read_seqbegin(&fs_info->profiles_lock);
4299
4300 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4301 (fs_info->avail_system_alloc_bits & allowed) &&
4302 !(bctl->sys.target & allowed)) ||
4303 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4304 (fs_info->avail_metadata_alloc_bits & allowed) &&
4305 !(bctl->meta.target & allowed)))
4306 reducing_redundancy = true;
4307 else
4308 reducing_redundancy = false;
4309
4310 /* if we're not converting, the target field is uninitialized */
4311 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4312 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4313 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4314 bctl->data.target : fs_info->avail_data_alloc_bits;
4315 } while (read_seqretry(&fs_info->profiles_lock, seq));
4316
4317 if (reducing_redundancy) {
4318 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4319 btrfs_info(fs_info,
4320 "balance: force reducing metadata redundancy");
4321 } else {
4322 btrfs_err(fs_info,
4323 "balance: reduces metadata redundancy, use --force if you want this");
4324 ret = -EINVAL;
4325 goto out;
4326 }
4327 }
4328
4329 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4330 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4331 btrfs_warn(fs_info,
4332 "balance: metadata profile %s has lower redundancy than data profile %s",
4333 btrfs_bg_type_to_raid_name(meta_target),
4334 btrfs_bg_type_to_raid_name(data_target));
4335 }
4336
4337 ret = insert_balance_item(fs_info, bctl);
4338 if (ret && ret != -EEXIST)
4339 goto out;
4340
4341 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4342 BUG_ON(ret == -EEXIST);
4343 BUG_ON(fs_info->balance_ctl);
4344 spin_lock(&fs_info->balance_lock);
4345 fs_info->balance_ctl = bctl;
4346 spin_unlock(&fs_info->balance_lock);
4347 } else {
4348 BUG_ON(ret != -EEXIST);
4349 spin_lock(&fs_info->balance_lock);
4350 update_balance_args(bctl);
4351 spin_unlock(&fs_info->balance_lock);
4352 }
4353
4354 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4355 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4356 describe_balance_start_or_resume(fs_info);
4357 mutex_unlock(&fs_info->balance_mutex);
4358
4359 ret = __btrfs_balance(fs_info);
4360
4361 mutex_lock(&fs_info->balance_mutex);
4362 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
4363 btrfs_info(fs_info, "balance: paused");
4364 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4365 }
4366
4367 /*
4368  * Balance can be canceled by:
4369  *
4370  * - Regular cancel request
4371  *   Then ret == -ECANCELED and balance_cancel_req > 0
4372  *
4373  * - Fatal signal to "btrfs" process
4374  *   Either the signal was caught by wait_reserve_ticket() and callers
4375  *   got -EINTR, or it was caught by btrfs_should_cancel_balance() and
4376  *   we got -ECANCELED. Either way balance_cancel_req stays 0 and
4377  *   ret is -ECANCELED or -EINTR.
4378  *
4379  * So here we only check the return value to catch canceled balance.
4380  */
4381 else if (ret == -ECANCELED || ret == -EINTR)
4382 btrfs_info(fs_info, "balance: canceled");
4383 else
4384 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4385
4386 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4387
4388 if (bargs) {
4389 memset(bargs, 0, sizeof(*bargs));
4390 btrfs_update_ioctl_balance_args(fs_info, bargs);
4391 }
4392
4393 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4394 balance_need_close(fs_info)) {
4395 reset_balance_state(fs_info);
4396 btrfs_exclop_finish(fs_info);
4397 }
4398
4399 wake_up(&fs_info->balance_wait_q);
4400
4401 return ret;
4402 out:
4403 if (bctl->flags & BTRFS_BALANCE_RESUME)
4404 reset_balance_state(fs_info);
4405 else
4406 kfree(bctl);
4407 btrfs_exclop_finish(fs_info);
4408
4409 return ret;
4410 }
4411
4412 static int balance_kthread(void *data)
4413 {
4414 struct btrfs_fs_info *fs_info = data;
4415 int ret = 0;
4416
4417 sb_start_write(fs_info->sb);
4418 mutex_lock(&fs_info->balance_mutex);
4419 if (fs_info->balance_ctl)
4420 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4421 mutex_unlock(&fs_info->balance_mutex);
4422 sb_end_write(fs_info->sb);
4423
4424 return ret;
4425 }
4426
4427 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4428 {
4429 struct task_struct *tsk;
4430
4431 mutex_lock(&fs_info->balance_mutex);
4432 if (!fs_info->balance_ctl) {
4433 mutex_unlock(&fs_info->balance_mutex);
4434 return 0;
4435 }
4436 mutex_unlock(&fs_info->balance_mutex);
4437
4438 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4439 btrfs_info(fs_info, "balance: resume skipped");
4440 return 0;
4441 }
4442
4443 spin_lock(&fs_info->super_lock);
4444 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4445 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4446 spin_unlock(&fs_info->super_lock);
4447
4448 /*
4449  * A ro->rw remount sequence should continue with the paused balance
4450  * regardless of who paused it, so set the resume flag.
4451  */
4452 spin_lock(&fs_info->balance_lock);
4453 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4454 spin_unlock(&fs_info->balance_lock);
4455
4456 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4457 return PTR_ERR_OR_ZERO(tsk);
4458 }
4459
4460 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4461 {
4462 struct btrfs_balance_control *bctl;
4463 struct btrfs_balance_item *item;
4464 struct btrfs_disk_balance_args disk_bargs;
4465 struct btrfs_path *path;
4466 struct extent_buffer *leaf;
4467 struct btrfs_key key;
4468 int ret;
4469
4470 path = btrfs_alloc_path();
4471 if (!path)
4472 return -ENOMEM;
4473
4474 key.objectid = BTRFS_BALANCE_OBJECTID;
4475 key.type = BTRFS_TEMPORARY_ITEM_KEY;
4476 key.offset = 0;
4477
4478 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4479 if (ret < 0)
4480 goto out;
4481 if (ret > 0) {
4482 ret = 0;
4483 goto out;
4484 }
4485
4486 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4487 if (!bctl) {
4488 ret = -ENOMEM;
4489 goto out;
4490 }
4491
4492 leaf = path->nodes[0];
4493 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4494
4495 bctl->flags = btrfs_balance_flags(leaf, item);
4496 bctl->flags |= BTRFS_BALANCE_RESUME;
4497
4498 btrfs_balance_data(leaf, item, &disk_bargs);
4499 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4500 btrfs_balance_meta(leaf, item, &disk_bargs);
4501 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4502 btrfs_balance_sys(leaf, item, &disk_bargs);
4503 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4504
4505 /*
4506  * This should never happen, as the paused balance state is recovered
4507  * during mount without any chance of other exclusive ops to collide.
4508  *
4509  * This gives the exclusive op status to balance and keeps it in the
4510  * paused state until user intervention (cancel or umount). If the
4511  * ownership cannot be assigned, show a message but do not fail. The
4512  * balance is in a paused state and must have fs_info::balance_ctl
4513  * properly set up.
4514  */
4515 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4516 btrfs_warn(fs_info,
4517 "balance: cannot set exclusive op status, resume manually");
4518
4519 btrfs_release_path(path);
4520
4521 mutex_lock(&fs_info->balance_mutex);
4522 BUG_ON(fs_info->balance_ctl);
4523 spin_lock(&fs_info->balance_lock);
4524 fs_info->balance_ctl = bctl;
4525 spin_unlock(&fs_info->balance_lock);
4526 mutex_unlock(&fs_info->balance_mutex);
4527 out:
4528 btrfs_free_path(path);
4529 return ret;
4530 }
4531
4532 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4533 {
4534 int ret = 0;
4535
4536 mutex_lock(&fs_info->balance_mutex);
4537 if (!fs_info->balance_ctl) {
4538 mutex_unlock(&fs_info->balance_mutex);
4539 return -ENOTCONN;
4540 }
4541
4542 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4543 atomic_inc(&fs_info->balance_pause_req);
4544 mutex_unlock(&fs_info->balance_mutex);
4545
4546 wait_event(fs_info->balance_wait_q,
4547 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4548
4549 mutex_lock(&fs_info->balance_mutex);
4550
4551 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4552 atomic_dec(&fs_info->balance_pause_req);
4553 } else {
4554 ret = -ENOTCONN;
4555 }
4556
4557 mutex_unlock(&fs_info->balance_mutex);
4558 return ret;
4559 }
4560
4561 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4562 {
4563 mutex_lock(&fs_info->balance_mutex);
4564 if (!fs_info->balance_ctl) {
4565 mutex_unlock(&fs_info->balance_mutex);
4566 return -ENOTCONN;
4567 }
4568
4569 /*
4570  * A paused balance with the item stored on disk can be resumed at
4571  * mount time if the mount is read-write. Otherwise it's still paused
4572  * and we must not allow cancelling as it deletes the item.
4573  */
4574 if (sb_rdonly(fs_info->sb)) {
4575 mutex_unlock(&fs_info->balance_mutex);
4576 return -EROFS;
4577 }
4578
4579 atomic_inc(&fs_info->balance_cancel_req);
4580
4581 /*
4582  * If balance is running just wait, btrfs_balance() deletes the item.
4583  */
4584 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4585 mutex_unlock(&fs_info->balance_mutex);
4586 wait_event(fs_info->balance_wait_q,
4587 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4588 mutex_lock(&fs_info->balance_mutex);
4589 } else {
4590 mutex_unlock(&fs_info->balance_mutex);
4591 /*
4592  * Lock released to allow other waiters to continue, we'll
4593  * reexamine the status again.
4594  */
4595 mutex_lock(&fs_info->balance_mutex);
4596
4597 if (fs_info->balance_ctl) {
4598 reset_balance_state(fs_info);
4599 btrfs_exclop_finish(fs_info);
4600 btrfs_info(fs_info, "balance: canceled");
4601 }
4602 }
4603
4604 BUG_ON(fs_info->balance_ctl ||
4605 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4606 atomic_dec(&fs_info->balance_cancel_req);
4607 mutex_unlock(&fs_info->balance_mutex);
4608 return 0;
4609 }
4610
4611 int btrfs_uuid_scan_kthread(void *data)
4612 {
4613 struct btrfs_fs_info *fs_info = data;
4614 struct btrfs_root *root = fs_info->tree_root;
4615 struct btrfs_key key;
4616 struct btrfs_path *path = NULL;
4617 int ret = 0;
4618 struct extent_buffer *eb;
4619 int slot;
4620 struct btrfs_root_item root_item;
4621 u32 item_size;
4622 struct btrfs_trans_handle *trans = NULL;
4623 bool closing = false;
4624
4625 path = btrfs_alloc_path();
4626 if (!path) {
4627 ret = -ENOMEM;
4628 goto out;
4629 }
4630
4631 key.objectid = 0;
4632 key.type = BTRFS_ROOT_ITEM_KEY;
4633 key.offset = 0;
4634
4635 while (1) {
4636 if (btrfs_fs_closing(fs_info)) {
4637 closing = true;
4638 break;
4639 }
4640 ret = btrfs_search_forward(root, &key, path,
4641 BTRFS_OLDEST_GENERATION);
4642 if (ret) {
4643 if (ret > 0)
4644 ret = 0;
4645 break;
4646 }
4647
4648 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4649 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4650 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4651 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4652 goto skip;
4653
4654 eb = path->nodes[0];
4655 slot = path->slots[0];
4656 item_size = btrfs_item_size(eb, slot);
4657 if (item_size < sizeof(root_item))
4658 goto skip;
4659
4660 read_extent_buffer(eb, &root_item,
4661 btrfs_item_ptr_offset(eb, slot),
4662 (int)sizeof(root_item));
4663 if (btrfs_root_refs(&root_item) == 0)
4664 goto skip;
4665
4666 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4667 !btrfs_is_empty_uuid(root_item.received_uuid)) {
4668 if (trans)
4669 goto update_tree;
4670
4671 btrfs_release_path(path);
4672 /*
4673  * 1 - subvol uuid item
4674  * 1 - received_subvol uuid item
4675  */
4676 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4677 if (IS_ERR(trans)) {
4678 ret = PTR_ERR(trans);
4679 break;
4680 }
4681 continue;
4682 } else {
4683 goto skip;
4684 }
4685 update_tree:
4686 btrfs_release_path(path);
4687 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4688 ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4689 BTRFS_UUID_KEY_SUBVOL,
4690 key.objectid);
4691 if (ret < 0) {
4692 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4693 ret);
4694 break;
4695 }
4696 }
4697
4698 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4699 ret = btrfs_uuid_tree_add(trans,
4700 root_item.received_uuid,
4701 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4702 key.objectid);
4703 if (ret < 0) {
4704 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4705 ret);
4706 break;
4707 }
4708 }
4709
4710 skip:
4711 btrfs_release_path(path);
4712 if (trans) {
4713 ret = btrfs_end_transaction(trans);
4714 trans = NULL;
4715 if (ret)
4716 break;
4717 }
4718
4719 if (key.offset < (u64)-1) {
4720 key.offset++;
4721 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4722 key.offset = 0;
4723 key.type = BTRFS_ROOT_ITEM_KEY;
4724 } else if (key.objectid < (u64)-1) {
4725 key.offset = 0;
4726 key.type = BTRFS_ROOT_ITEM_KEY;
4727 key.objectid++;
4728 } else {
4729 break;
4730 }
4731 cond_resched();
4732 }
4733
4734 out:
4735 btrfs_free_path(path);
4736 if (trans && !IS_ERR(trans))
4737 btrfs_end_transaction(trans);
4738 if (ret)
4739 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4740 else if (!closing)
4741 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4742 up(&fs_info->uuid_tree_rescan_sem);
4743 return 0;
4744 }
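/*
 * Editor's illustration (not in the original source): the key-advance logic
 * at the bottom of btrfs_uuid_scan_kthread() factored into a helper, to make
 * the iteration order (offset, then type, then objectid) explicit. The
 * function name is hypothetical and the sketch is illustrative only.
 */
static bool __maybe_unused example_advance_root_key(struct btrfs_key *key)
{
	if (key->offset < (u64)-1) {
		/* Next item with the same objectid and type */
		key->offset++;
	} else if (key->type < BTRFS_ROOT_ITEM_KEY) {
		/* Restart offsets at the target item type */
		key->offset = 0;
		key->type = BTRFS_ROOT_ITEM_KEY;
	} else if (key->objectid < (u64)-1) {
		/* Move on to the next possible tree objectid */
		key->offset = 0;
		key->type = BTRFS_ROOT_ITEM_KEY;
		key->objectid++;
	} else {
		/* The whole key space has been scanned */
		return false;
	}
	return true;
}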
4745
4746 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4747 {
4748 struct btrfs_trans_handle *trans;
4749 struct btrfs_root *tree_root = fs_info->tree_root;
4750 struct btrfs_root *uuid_root;
4751 struct task_struct *task;
4752 int ret;
4753
4754 /*
4755  * 1 - root node
4756  * 1 - root item
4757  */
4758 trans = btrfs_start_transaction(tree_root, 2);
4759 if (IS_ERR(trans))
4760 return PTR_ERR(trans);
4761
4762 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4763 if (IS_ERR(uuid_root)) {
4764 ret = PTR_ERR(uuid_root);
4765 btrfs_abort_transaction(trans, ret);
4766 btrfs_end_transaction(trans);
4767 return ret;
4768 }
4769
4770 fs_info->uuid_root = uuid_root;
4771
4772 ret = btrfs_commit_transaction(trans);
4773 if (ret)
4774 return ret;
4775
4776 down(&fs_info->uuid_tree_rescan_sem);
4777 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4778 if (IS_ERR(task)) {
4779 /* kthread_run() returned an error; drop the semaphore taken above */
4780 btrfs_warn(fs_info, "failed to start uuid_scan task");
4781 up(&fs_info->uuid_tree_rescan_sem);
4782 return PTR_ERR(task);
4783 }
4784
4785 return 0;
4786 }
4787
4788 /*
4789  * Shrinking a device means finding all of the device extents past
4790  * the new size, and relocating the extents in each chunk to a spot
4791  * before the new size.
4792  */
4793 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4794 {
4795 struct btrfs_fs_info *fs_info = device->fs_info;
4796 struct btrfs_root *root = fs_info->dev_root;
4797 struct btrfs_trans_handle *trans;
4798 struct btrfs_dev_extent *dev_extent = NULL;
4799 struct btrfs_path *path;
4800 u64 length;
4801 u64 chunk_offset;
4802 int ret;
4803 int slot;
4804 int failed = 0;
4805 bool retried = false;
4806 struct extent_buffer *l;
4807 struct btrfs_key key;
4808 struct btrfs_super_block *super_copy = fs_info->super_copy;
4809 u64 old_total = btrfs_super_total_bytes(super_copy);
4810 u64 old_size = btrfs_device_get_total_bytes(device);
4811 u64 diff;
4812 u64 start;
4813
4814 new_size = round_down(new_size, fs_info->sectorsize);
4815 start = new_size;
4816 diff = round_down(old_size - new_size, fs_info->sectorsize);
4817
4818 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4819 return -EINVAL;
4820
4821 path = btrfs_alloc_path();
4822 if (!path)
4823 return -ENOMEM;
4824
4825 path->reada = READA_BACK;
4826
4827 trans = btrfs_start_transaction(root, 0);
4828 if (IS_ERR(trans)) {
4829 btrfs_free_path(path);
4830 return PTR_ERR(trans);
4831 }
4832
4833 mutex_lock(&fs_info->chunk_mutex);
4834
4835 btrfs_device_set_total_bytes(device, new_size);
4836 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4837 device->fs_devices->total_rw_bytes -= diff;
4838 atomic64_sub(diff, &fs_info->free_chunk_space);
4839 }
4840
4841 /*
4842  * Once the device's size has been set to the new size, ensure all
4843  * in-memory chunks are synced to disk so that the loop below sees them
4844  * and relocates them accordingly.
4845  */
4846 if (contains_pending_extent(device, &start, diff)) {
4847 mutex_unlock(&fs_info->chunk_mutex);
4848 ret = btrfs_commit_transaction(trans);
4849 if (ret)
4850 goto done;
4851 } else {
4852 mutex_unlock(&fs_info->chunk_mutex);
4853 btrfs_end_transaction(trans);
4854 }
4855
4856 again:
4857 key.objectid = device->devid;
4858 key.offset = (u64)-1;
4859 key.type = BTRFS_DEV_EXTENT_KEY;
4860
4861 do {
4862 mutex_lock(&fs_info->reclaim_bgs_lock);
4863 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4864 if (ret < 0) {
4865 mutex_unlock(&fs_info->reclaim_bgs_lock);
4866 goto done;
4867 }
4868
4869 ret = btrfs_previous_item(root, path, 0, key.type);
4870 if (ret) {
4871 mutex_unlock(&fs_info->reclaim_bgs_lock);
4872 if (ret < 0)
4873 goto done;
4874 ret = 0;
4875 btrfs_release_path(path);
4876 break;
4877 }
4878
4879 l = path->nodes[0];
4880 slot = path->slots[0];
4881 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4882
4883 if (key.objectid != device->devid) {
4884 mutex_unlock(&fs_info->reclaim_bgs_lock);
4885 btrfs_release_path(path);
4886 break;
4887 }
4888
4889 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4890 length = btrfs_dev_extent_length(l, dev_extent);
4891
4892 if (key.offset + length <= new_size) {
4893 mutex_unlock(&fs_info->reclaim_bgs_lock);
4894 btrfs_release_path(path);
4895 break;
4896 }
4897
4898 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4899 btrfs_release_path(path);
4900
4901 /*
4902  * We may be relocating the only data chunk we have,
4903  * which could potentially end up with losing data's
4904  * raid profile, so lets allocate an empty one in
4905  * advance.
4906  */
4907 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4908 if (ret < 0) {
4909 mutex_unlock(&fs_info->reclaim_bgs_lock);
4910 goto done;
4911 }
4912
4913 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4914 mutex_unlock(&fs_info->reclaim_bgs_lock);
4915 if (ret == -ENOSPC) {
4916 failed++;
4917 } else if (ret) {
4918 if (ret == -ETXTBSY) {
4919 btrfs_warn(fs_info,
4920 "could not shrink block group %llu due to active swapfile",
4921 chunk_offset);
4922 }
4923 goto done;
4924 }
4925 } while (key.offset-- > 0);
4926
4927 if (failed && !retried) {
4928 failed = 0;
4929 retried = true;
4930 goto again;
4931 } else if (failed && retried) {
4932 ret = -ENOSPC;
4933 goto done;
4934 }
4935
4936 /* Now btrfs_update_device() will change the on-disk size. */
4937 trans = btrfs_start_transaction(root, 0);
4938 if (IS_ERR(trans)) {
4939 ret = PTR_ERR(trans);
4940 goto done;
4941 }
4942
4943 mutex_lock(&fs_info->chunk_mutex);
4944
4945 clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4946 CHUNK_STATE_MASK);
4947
4948 btrfs_device_set_disk_total_bytes(device, new_size);
4949 if (list_empty(&device->post_commit_list))
4950 list_add_tail(&device->post_commit_list,
4951 &trans->transaction->dev_update_list);
4952
4953 WARN_ON(diff > old_total);
4954 btrfs_set_super_total_bytes(super_copy,
4955 round_down(old_total - diff, fs_info->sectorsize));
4956 mutex_unlock(&fs_info->chunk_mutex);
4957
4958 btrfs_reserve_chunk_metadata(trans, false);
4959
4960 ret = btrfs_update_device(trans, device);
4961 btrfs_trans_release_chunk_metadata(trans);
4962 if (ret < 0) {
4963 btrfs_abort_transaction(trans, ret);
4964 btrfs_end_transaction(trans);
4965 } else {
4966 ret = btrfs_commit_transaction(trans);
4967 }
4968 done:
4969 btrfs_free_path(path);
4970 if (ret) {
4971 mutex_lock(&fs_info->chunk_mutex);
4972 btrfs_device_set_total_bytes(device, old_size);
4973 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4974 device->fs_devices->total_rw_bytes += diff;
4975 atomic64_add(diff, &fs_info->free_chunk_space);
4976 mutex_unlock(&fs_info->chunk_mutex);
4977 }
4978 return ret;
4979 }
4980
4981 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4982 struct btrfs_key *key,
4983 struct btrfs_chunk *chunk, int item_size)
4984 {
4985 struct btrfs_super_block *super_copy = fs_info->super_copy;
4986 struct btrfs_disk_key disk_key;
4987 u32 array_size;
4988 u8 *ptr;
4989
4990 lockdep_assert_held(&fs_info->chunk_mutex);
4991
4992 array_size = btrfs_super_sys_array_size(super_copy);
4993 if (array_size + item_size + sizeof(disk_key)
4994 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4995 return -EFBIG;
4996
4997 ptr = super_copy->sys_chunk_array + array_size;
4998 btrfs_cpu_key_to_disk(&disk_key, key);
4999 memcpy(ptr, &disk_key, sizeof(disk_key));
5000 ptr += sizeof(disk_key);
5001 memcpy(ptr, chunk, item_size);
5002 item_size += sizeof(disk_key);
5003 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5004
5005 return 0;
5006 }
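/*
 * Editor's illustration (not in the original source): the superblock's
 * sys_chunk_array filled by btrfs_add_system_chunk() is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk + extra stripes) pairs. A
 * minimal read-side walk, assuming the array was already validated (the real
 * reader is btrfs_read_sys_array() later in this file); the helper name is
 * hypothetical.
 */
static void __maybe_unused example_walk_sys_chunk_array(
		struct btrfs_super_block *sb)
{
	u32 array_size = btrfs_super_sys_array_size(sb);
	u8 *ptr = sb->sys_chunk_array;
	u8 *end = ptr + array_size;

	while (ptr + sizeof(struct btrfs_disk_key) <= end) {
		struct btrfs_chunk *chunk;

		/* Each entry starts with the key of the chunk item */
		ptr += sizeof(struct btrfs_disk_key);
		chunk = (struct btrfs_chunk *)ptr;
		/* The chunk item is variable sized, one stripe is embedded */
		ptr += btrfs_chunk_item_size(
				btrfs_stack_chunk_num_stripes(chunk));
	}
}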
5007
5008 /*
5009  * Sort the devices in descending order by max_avail, then total_avail.
5010  */
5011 static int btrfs_cmp_device_info(const void *a, const void *b)
5012 {
5013 const struct btrfs_device_info *di_a = a;
5014 const struct btrfs_device_info *di_b = b;
5015
5016 if (di_a->max_avail > di_b->max_avail)
5017 return -1;
5018 if (di_a->max_avail < di_b->max_avail)
5019 return 1;
5020 if (di_a->total_avail > di_b->total_avail)
5021 return -1;
5022 if (di_a->total_avail < di_b->total_avail)
5023 return 1;
5024 return 0;
5025 }
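/*
 * Editor's note (not in the original source): with this comparator, the
 * sort() call in gather_device_info() below orders the candidates largest
 * hole first, so decide_stripe_size_regular() can size the stripe from the
 * last (smallest) selected entry and still fit every chosen device.
 */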
5026
5027 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5028 {
5029 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5030 return;
5031
5032 btrfs_set_fs_incompat(info, RAID56);
5033 }
5034
5035 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5036 {
5037 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5038 return;
5039
5040 btrfs_set_fs_incompat(info, RAID1C34);
5041 }
5042
5043
5044 /*
5045  * Structure used internally for btrfs_create_chunk(); wraps the parameters.
5046  */
5047 struct alloc_chunk_ctl {
5048 u64 start;
5049 u64 type;
5050 /* Total number of stripes to allocate */
5051 int num_stripes;
5052 /* sub_stripes info for map */
5053 int sub_stripes;
5054 /* Stripes per device */
5055 int dev_stripes;
5056 /* Maximum number of devices to use */
5057 int devs_max;
5058 /* Minimum number of devices to use */
5059 int devs_min;
5060 /* ndevs has to be a multiple of this */
5061 int devs_increment;
5062 /* Number of copies */
5063 int ncopies;
5064 /* Number of stripes worth of bytes to store parity information */
5065 int nparity;
5066 u64 max_stripe_size;
5067 u64 max_chunk_size;
5068 u64 dev_extent_min;
5069 u64 stripe_size;
5070 u64 chunk_size;
5071 int ndevs;
5072 };
5073
5074 static void init_alloc_chunk_ctl_policy_regular(
5075 struct btrfs_fs_devices *fs_devices,
5076 struct alloc_chunk_ctl *ctl)
5077 {
5078 struct btrfs_space_info *space_info;
5079
5080 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
5081 ASSERT(space_info);
5082
5083 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
5084 ctl->max_stripe_size = ctl->max_chunk_size;
5085
5086 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5087 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
5088
5089 /* We don't want a chunk larger than 10% of writable space */
5090 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5091 ctl->max_chunk_size);
5092 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5093 }
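/*
 * Editor's worked example (not in the original source), assuming the current
 * defaults: for a data chunk, space_info->chunk_size is 10GiB, and
 * div_factor(total_rw_bytes, 1) caps the chunk at 10% of writable space. On
 * 500GiB of writable space that is min(50GiB, 10GiB) = 10GiB; on a 4GiB
 * filesystem it is min(~410MiB, 10GiB) = ~410MiB, so chunks shrink with the
 * device. Numbers are illustrative only.
 */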
5094
5095 static void init_alloc_chunk_ctl_policy_zoned(
5096 struct btrfs_fs_devices *fs_devices,
5097 struct alloc_chunk_ctl *ctl)
5098 {
5099 u64 zone_size = fs_devices->fs_info->zone_size;
5100 u64 limit;
5101 int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5102 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5103 u64 min_chunk_size = min_data_stripes * zone_size;
5104 u64 type = ctl->type;
5105
5106 ctl->max_stripe_size = zone_size;
5107 if (type & BTRFS_BLOCK_GROUP_DATA) {
5108 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5109 zone_size);
5110 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5111 ctl->max_chunk_size = ctl->max_stripe_size;
5112 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5113 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5114 ctl->devs_max = min_t(int, ctl->devs_max,
5115 BTRFS_MAX_DEVS_SYS_CHUNK);
5116 } else {
5117 BUG();
5118 }
5119
5120 /* We don't want a chunk larger than 10% of writable space */
5121 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5122 zone_size),
5123 min_chunk_size);
5124 ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5125 ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5126 }
5127
5128 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5129 struct alloc_chunk_ctl *ctl)
5130 {
5131 int index = btrfs_bg_flags_to_raid_index(ctl->type);
5132
5133 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5134 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5135 ctl->devs_max = btrfs_raid_array[index].devs_max;
5136 if (!ctl->devs_max)
5137 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5138 ctl->devs_min = btrfs_raid_array[index].devs_min;
5139 ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5140 ctl->ncopies = btrfs_raid_array[index].ncopies;
5141 ctl->nparity = btrfs_raid_array[index].nparity;
5142 ctl->ndevs = 0;
5143
5144 switch (fs_devices->chunk_alloc_policy) {
5145 case BTRFS_CHUNK_ALLOC_REGULAR:
5146 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5147 break;
5148 case BTRFS_CHUNK_ALLOC_ZONED:
5149 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5150 break;
5151 default:
5152 BUG();
5153 }
5154 }
5155
5156 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5157 struct alloc_chunk_ctl *ctl,
5158 struct btrfs_device_info *devices_info)
5159 {
5160 struct btrfs_fs_info *info = fs_devices->fs_info;
5161 struct btrfs_device *device;
5162 u64 total_avail;
5163 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5164 int ret;
5165 int ndevs = 0;
5166 u64 max_avail;
5167 u64 dev_offset;
5168
5169 /*
5170  * In the first pass through the devices list, we gather information
5171  * about the available holes on each device.
5172  */
5173 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5174 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5175 WARN(1, KERN_ERR
5176 "BTRFS: read-only device in alloc_list\n");
5177 continue;
5178 }
5179
5180 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5181 &device->dev_state) ||
5182 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5183 continue;
5184
5185 if (device->total_bytes > device->bytes_used)
5186 total_avail = device->total_bytes - device->bytes_used;
5187 else
5188 total_avail = 0;
5189
5190 /* If there is no space on this device, skip it. */
5191 if (total_avail < ctl->dev_extent_min)
5192 continue;
5193
5194 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5195 &max_avail);
5196 if (ret && ret != -ENOSPC)
5197 return ret;
5198
5199 if (ret == 0)
5200 max_avail = dev_extent_want;
5201
5202 if (max_avail < ctl->dev_extent_min) {
5203 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5204 btrfs_debug(info,
5205 "%s: devid %llu has no free space, have=%llu want=%llu",
5206 __func__, device->devid, max_avail,
5207 ctl->dev_extent_min);
5208 continue;
5209 }
5210
5211 if (ndevs == fs_devices->rw_devices) {
5212 WARN(1, "%s: found more than %llu devices\n",
5213 __func__, fs_devices->rw_devices);
5214 break;
5215 }
5216 devices_info[ndevs].dev_offset = dev_offset;
5217 devices_info[ndevs].max_avail = max_avail;
5218 devices_info[ndevs].total_avail = total_avail;
5219 devices_info[ndevs].dev = device;
5220 ++ndevs;
5221 }
5222 ctl->ndevs = ndevs;
5223
5224 /*
5225  * Now sort the devices by hole size / available space.
5226  */
5227 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5228 btrfs_cmp_device_info, NULL);
5229
5230 return 0;
5231 }
5232
5233 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5234 struct btrfs_device_info *devices_info)
5235 {
5236 /* Number of stripes that count for block group size */
5237 int data_stripes;
5238
5239 /*
5240  * The primary goal is to maximize the number of stripes, so use as
5241  * many devices as possible, even if the stripes are not maximum sized.
5242  *
5243  * The DUP profile stores more than one stripe per device, the
5244  * max_avail is the total size so we have to adjust.
5245  */
5246 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5247 ctl->dev_stripes);
5248 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5249
5250 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5251 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5252
5253
5254
5255 /*
5256  * Use the number of data stripes to figure out how big this chunk is
5257  * really going to be.
5258  */
5259 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5260 /*
5261  * Reduce stripe_size, round it up to a 16MB boundary again and
5262  * then use it, unless it ends up being even bigger than the
5263  * previous value we had already.
5264  */
5265 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5266 data_stripes), SZ_16M),
5267 ctl->stripe_size);
5268 }
5269
5270 /* Stripe size should not go beyond 1G. */
5271 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
5272
5273 /* Align to BTRFS_STRIPE_LEN */
5274 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5275 ctl->chunk_size = ctl->stripe_size * data_stripes;
5276
5277 return 0;
5278 }
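/*
 * Editor's worked example (not in the original source): RAID1 metadata over
 * two devices whose largest holes are 5GiB and 3GiB. The devices were sorted
 * in descending order, so the last usable entry has max_avail = 3GiB; with
 * dev_stripes == 1 that is the initial stripe_size. num_stripes = 2 and
 * ncopies = 2 give data_stripes = 1, so the prospective chunk is 3GiB. With
 * a 1GiB max_chunk_size that is too big, and stripe_size is reduced to
 * min(round_up(1GiB / 1, 16MiB), 3GiB) = 1GiB, for a final chunk_size of
 * 1GiB. All sizes are illustrative.
 */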
5279
5280 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5281 struct btrfs_device_info *devices_info)
5282 {
5283 u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5284 /* Number of stripes that count for block group size */
5285 int data_stripes;
5286
5287 /*
5288  * It should hold because:
5289  *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5290  */
5291 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5292
5293 ctl->stripe_size = zone_size;
5294 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5295 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5296
5297 /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
5298 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5299 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5300 ctl->stripe_size) + ctl->nparity,
5301 ctl->dev_stripes);
5302 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5303 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5304 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5305 }
5306
5307 ctl->chunk_size = ctl->stripe_size * data_stripes;
5308
5309 return 0;
5310 }
5311
5312 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5313 struct alloc_chunk_ctl *ctl,
5314 struct btrfs_device_info *devices_info)
5315 {
5316 struct btrfs_fs_info *info = fs_devices->fs_info;
5317
5318 /*
5319  * Round down to number of usable stripes, devs_increment can be any
5320  * number so we can't use round_down() as that requires a power of 2,
5321  * while rounddown is safe.
5322  */
5323 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5324
5325 if (ctl->ndevs < ctl->devs_min) {
5326 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5327 btrfs_debug(info,
5328 "%s: not enough devices with free space: have=%d minimum required=%d",
5329 __func__, ctl->ndevs, ctl->devs_min);
5330 }
5331 return -ENOSPC;
5332 }
5333
5334 ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5335
5336 switch (fs_devices->chunk_alloc_policy) {
5337 case BTRFS_CHUNK_ALLOC_REGULAR:
5338 return decide_stripe_size_regular(ctl, devices_info);
5339 case BTRFS_CHUNK_ALLOC_ZONED:
5340 return decide_stripe_size_zoned(ctl, devices_info);
5341 default:
5342 BUG();
5343 }
5344 }
5345
5346 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5347 struct alloc_chunk_ctl *ctl,
5348 struct btrfs_device_info *devices_info)
5349 {
5350 struct btrfs_fs_info *info = trans->fs_info;
5351 struct map_lookup *map = NULL;
5352 struct extent_map_tree *em_tree;
5353 struct btrfs_block_group *block_group;
5354 struct extent_map *em;
5355 u64 start = ctl->start;
5356 u64 type = ctl->type;
5357 int ret;
5358 int i;
5359 int j;
5360
5361 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5362 if (!map)
5363 return ERR_PTR(-ENOMEM);
5364 map->num_stripes = ctl->num_stripes;
5365
5366 for (i = 0; i < ctl->ndevs; ++i) {
5367 for (j = 0; j < ctl->dev_stripes; ++j) {
5368 int s = i * ctl->dev_stripes + j;
5369 map->stripes[s].dev = devices_info[i].dev;
5370 map->stripes[s].physical = devices_info[i].dev_offset +
5371 j * ctl->stripe_size;
5372 }
5373 }
5374 map->stripe_len = BTRFS_STRIPE_LEN;
5375 map->io_align = BTRFS_STRIPE_LEN;
5376 map->io_width = BTRFS_STRIPE_LEN;
5377 map->type = type;
5378 map->sub_stripes = ctl->sub_stripes;
5379
5380 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5381
5382 em = alloc_extent_map();
5383 if (!em) {
5384 kfree(map);
5385 return ERR_PTR(-ENOMEM);
5386 }
5387 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5388 em->map_lookup = map;
5389 em->start = start;
5390 em->len = ctl->chunk_size;
5391 em->block_start = 0;
5392 em->block_len = em->len;
5393 em->orig_block_len = ctl->stripe_size;
5394
5395 em_tree = &info->mapping_tree;
5396 write_lock(&em_tree->lock);
5397 ret = add_extent_mapping(em_tree, em, 0);
5398 if (ret) {
5399 write_unlock(&em_tree->lock);
5400 free_extent_map(em);
5401 return ERR_PTR(ret);
5402 }
5403 write_unlock(&em_tree->lock);
5404
5405 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5406 if (IS_ERR(block_group))
5407 goto error_del_extent;
5408
5409 for (i = 0; i < map->num_stripes; i++) {
5410 struct btrfs_device *dev = map->stripes[i].dev;
5411
5412 btrfs_device_set_bytes_used(dev,
5413 dev->bytes_used + ctl->stripe_size);
5414 if (list_empty(&dev->post_commit_list))
5415 list_add_tail(&dev->post_commit_list,
5416 &trans->transaction->dev_update_list);
5417 }
5418
5419 atomic64_sub(ctl->stripe_size * map->num_stripes,
5420 &info->free_chunk_space);
5421
5422 free_extent_map(em);
5423 check_raid56_incompat_flag(info, type);
5424 check_raid1c34_incompat_flag(info, type);
5425
5426 return block_group;
5427
5428 error_del_extent:
5429 write_lock(&em_tree->lock);
5430 remove_extent_mapping(em_tree, em);
5431 write_unlock(&em_tree->lock);
5432
5433 /* One for our allocation */
5434 free_extent_map(em);
5435 /* One for the tree reference */
5436 free_extent_map(em);
5437
5438 return block_group;
5439 }
5440
5441 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
5442 u64 type)
5443 {
5444 struct btrfs_fs_info *info = trans->fs_info;
5445 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5446 struct btrfs_device_info *devices_info = NULL;
5447 struct alloc_chunk_ctl ctl;
5448 struct btrfs_block_group *block_group;
5449 int ret;
5450
5451 lockdep_assert_held(&info->chunk_mutex);
5452
5453 if (!alloc_profile_is_valid(type, 0)) {
5454 ASSERT(0);
5455 return ERR_PTR(-EINVAL);
5456 }
5457
5458 if (list_empty(&fs_devices->alloc_list)) {
5459 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5460 btrfs_debug(info, "%s: no writable device", __func__);
5461 return ERR_PTR(-ENOSPC);
5462 }
5463
5464 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5465 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5466 ASSERT(0);
5467 return ERR_PTR(-EINVAL);
5468 }
5469
5470 ctl.start = find_next_chunk(info);
5471 ctl.type = type;
5472 init_alloc_chunk_ctl(fs_devices, &ctl);
5473
5474 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5475 GFP_NOFS);
5476 if (!devices_info)
5477 return ERR_PTR(-ENOMEM);
5478
5479 ret = gather_device_info(fs_devices, &ctl, devices_info);
5480 if (ret < 0) {
5481 block_group = ERR_PTR(ret);
5482 goto out;
5483 }
5484
5485 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5486 if (ret < 0) {
5487 block_group = ERR_PTR(ret);
5488 goto out;
5489 }
5490
5491 block_group = create_chunk(trans, &ctl, devices_info);
5492
5493 out:
5494 kfree(devices_info);
5495 return block_group;
5496 }
5497
5498 /*
5499  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
5500  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating
5501  * system chunks.
5502  *
5503  * See the comment at btrfs_chunk_alloc() for details about the chunk
5504  * allocation phases.
5505  */
5506 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5507 struct btrfs_block_group *bg)
5508 {
5509 struct btrfs_fs_info *fs_info = trans->fs_info;
5510 struct btrfs_root *chunk_root = fs_info->chunk_root;
5511 struct btrfs_key key;
5512 struct btrfs_chunk *chunk;
5513 struct btrfs_stripe *stripe;
5514 struct extent_map *em;
5515 struct map_lookup *map;
5516 size_t item_size;
5517 int i;
5518 int ret;
5519
5520 /*
5521  * We take the chunk_mutex for 2 reasons:
5522  *
5523  * 1) Updates and insertions in the chunk btree must be done while holding
5524  *    the chunk_mutex, as well as updating the system chunk array in the
5525  *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5526  *    details;
5527  *
5528  * 2) To prevent races with the final phase of a device replace operation
5529  *    that replaces the device object associated with the map's stripes,
5530  *    because the device object's id can change at any time during that
5531  *    final phase of the device replace operation
5532  *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5533  *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5534  *    which would cause a failure when updating the device item, which does
5535  *    not exist, or persisting a stripe of the chunk item with such ID.
5536  *    Here we can't use the device_list_mutex because our caller already
5537  *    has locked the chunk_mutex, and the final phase of device replace
5538  *    acquires both mutexes - first the device_list_mutex and then the
5539  *    chunk_mutex. Same order must be respected for avoiding deadlocks.
5540  */
5541
5542 lockdep_assert_held(&fs_info->chunk_mutex);
5543
5544 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5545 if (IS_ERR(em)) {
5546 ret = PTR_ERR(em);
5547 btrfs_abort_transaction(trans, ret);
5548 return ret;
5549 }
5550
5551 map = em->map_lookup;
5552 item_size = btrfs_chunk_item_size(map->num_stripes);
5553
5554 chunk = kzalloc(item_size, GFP_NOFS);
5555 if (!chunk) {
5556 ret = -ENOMEM;
5557 btrfs_abort_transaction(trans, ret);
5558 goto out;
5559 }
5560
5561 for (i = 0; i < map->num_stripes; i++) {
5562 struct btrfs_device *device = map->stripes[i].dev;
5563
5564 ret = btrfs_update_device(trans, device);
5565 if (ret)
5566 goto out;
5567 }
5568
5569 stripe = &chunk->stripe;
5570 for (i = 0; i < map->num_stripes; i++) {
5571 struct btrfs_device *device = map->stripes[i].dev;
5572 const u64 dev_offset = map->stripes[i].physical;
5573
5574 btrfs_set_stack_stripe_devid(stripe, device->devid);
5575 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5576 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5577 stripe++;
5578 }
5579
5580 btrfs_set_stack_chunk_length(chunk, bg->length);
5581 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
5582 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5583 btrfs_set_stack_chunk_type(chunk, map->type);
5584 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5585 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5586 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5587 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5588 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5589
5590 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5591 key.type = BTRFS_CHUNK_ITEM_KEY;
5592 key.offset = bg->start;
5593
5594 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5595 if (ret)
5596 goto out;
5597
5598 bg->chunk_item_inserted = 1;
5599
5600 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5601 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5602 if (ret)
5603 goto out;
5604 }
5605
5606 out:
5607 kfree(chunk);
5608 free_extent_map(em);
5609 return ret;
5610 }
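/*
 * Editor's note (not in the original source): btrfs_chunk_item_size(n) used
 * above is sizeof(struct btrfs_chunk) plus (n - 1) * sizeof(struct
 * btrfs_stripe), since struct btrfs_chunk already embeds its first stripe.
 * A two-stripe RAID1 chunk item therefore carries exactly one extra stripe
 * record after the fixed header.
 */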
5611
5612 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5613 {
5614 struct btrfs_fs_info *fs_info = trans->fs_info;
5615 u64 alloc_profile;
5616 struct btrfs_block_group *meta_bg;
5617 struct btrfs_block_group *sys_bg;
5618
5619 /*
5620  * When adding a new device for sprouting, the seed device is read-only
5621  * so we must first allocate a metadata and a system chunk. But before
5622  * adding the block group items to the extent, device and chunk btrees,
5623  * we must first:
5624  *
5625  * 1) Create both chunks without doing any changes to the btrees, as
5626  *    otherwise we would get -ENOSPC since the block groups from the
5627  *    seed device are read-only;
5628  *
5629  * 2) Add the device item for the new sprout device - finishing the setup
5630  *    of a new block group requires updating the device item in the chunk
5631  *    btree, so it must exist when we attempt to do it. The previous step
5632  *    ensures this does not fail with -ENOSPC.
5633  *
5634  * After that we can add the block group items to their btrees: update the
5635  * existing device item in the chunk btree, add a new block group item to
5636  * the extent btree, add a new chunk item to the chunk btree and finally
5637  * add the new device extent items to the devices btree.
5638  */
5639
5640 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5641 meta_bg = btrfs_create_chunk(trans, alloc_profile);
5642 if (IS_ERR(meta_bg))
5643 return PTR_ERR(meta_bg);
5644
5645 alloc_profile = btrfs_system_alloc_profile(fs_info);
5646 sys_bg = btrfs_create_chunk(trans, alloc_profile);
5647 if (IS_ERR(sys_bg))
5648 return PTR_ERR(sys_bg);
5649
5650 return 0;
5651 }
5652
5653 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5654 {
5655 const int index = btrfs_bg_flags_to_raid_index(map->type);
5656
5657 return btrfs_raid_array[index].tolerated_failures;
5658 }
5659
5660 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5661 {
5662 struct extent_map *em;
5663 struct map_lookup *map;
5664 int miss_ndevs = 0;
5665 int i;
5666 bool ret = true;
5667
5668 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5669 if (IS_ERR(em))
5670 return false;
5671
5672 map = em->map_lookup;
5673 for (i = 0; i < map->num_stripes; i++) {
5674 if (test_bit(BTRFS_DEV_STATE_MISSING,
5675 &map->stripes[i].dev->dev_state)) {
5676 miss_ndevs++;
5677 continue;
5678 }
5679 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5680 &map->stripes[i].dev->dev_state)) {
5681 ret = false;
5682 goto end;
5683 }
5684 }
5685
5686 /*
5687  * If the number of missing devices is larger than max errors, we can
5688  * not write the data into that chunk successfully.
5689  */
5690 if (miss_ndevs > btrfs_chunk_max_errors(map))
5691 ret = false;
5692 end:
5693 free_extent_map(em);
5694 return ret;
5695 }
5696
5697 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5698 {
5699 struct extent_map *em;
5700
5701 while (1) {
5702 write_lock(&tree->lock);
5703 em = lookup_extent_mapping(tree, 0, (u64)-1);
5704 if (em)
5705 remove_extent_mapping(tree, em);
5706 write_unlock(&tree->lock);
5707 if (!em)
5708 break;
5709 /* Once for us */
5710 free_extent_map(em);
5711 /* Once for the tree */
5712 free_extent_map(em);
5713 }
5714 }
5715
5716 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5717 {
5718 struct extent_map *em;
5719 struct map_lookup *map;
5720 enum btrfs_raid_types index;
5721 int ret = 1;
5722
5723 em = btrfs_get_chunk_map(fs_info, logical, len);
5724 if (IS_ERR(em))
5725 /*
5726  * We could return errors for these cases, but that could get ugly
5727  * and we'd probably do the same thing which is just not do anything
5728  * else and exit, so return 1 so the callers don't try to use other
5729  * copies.
5730  */
5731 return 1;
5732
5733 map = em->map_lookup;
5734 index = btrfs_bg_flags_to_raid_index(map->type);
5735
5736 /* Non-RAID56, use their ncopies from btrfs_raid_array. */
5737 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5738 ret = btrfs_raid_array[index].ncopies;
5739 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5740 ret = 2;
5741 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5742 /*
5743  * There could be two corrupted data stripes, we need
5744  * to loop retry in order to rebuild the correct data.
5745  *
5746  * Fail a stripe at a time on every retry except the
5747  * stripe under reconstruction.
5748  */
5749 ret = map->num_stripes;
5750 free_extent_map(em);
5751
5752 down_read(&fs_info->dev_replace.rwsem);
5753 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5754 fs_info->dev_replace.tgtdev)
5755 ret++;
5756 up_read(&fs_info->dev_replace.rwsem);
5757
5758 return ret;
5759 }
5760
5761 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5762 u64 logical)
5763 {
5764 struct extent_map *em;
5765 struct map_lookup *map;
5766 unsigned long len = fs_info->sectorsize;
5767
5768 if (!btrfs_fs_incompat(fs_info, RAID56))
5769 return len;
5770
5771 em = btrfs_get_chunk_map(fs_info, logical, len);
5772
5773 if (!WARN_ON(IS_ERR(em))) {
5774 map = em->map_lookup;
5775 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5776 len = map->stripe_len * nr_data_stripes(map);
5777 free_extent_map(em);
5778 }
5779 return len;
5780 }
5781
5782 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5783 {
5784 struct extent_map *em;
5785 struct map_lookup *map;
5786 int ret = 0;
5787
5788 if (!btrfs_fs_incompat(fs_info, RAID56))
5789 return 0;
5790
5791 em = btrfs_get_chunk_map(fs_info, logical, len);
5792
5793 if (!WARN_ON(IS_ERR(em))) {
5794 map = em->map_lookup;
5795 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5796 ret = 1;
5797 free_extent_map(em);
5798 }
5799 return ret;
5800 }
5801
5802 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5803 struct map_lookup *map, int first,
5804 int dev_replace_is_ongoing)
5805 {
5806 int i;
5807 int num_stripes;
5808 int preferred_mirror;
5809 int tolerance;
5810 struct btrfs_device *srcdev;
5811
5812 ASSERT((map->type &
5813 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5814
5815 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5816 num_stripes = map->sub_stripes;
5817 else
5818 num_stripes = map->num_stripes;
5819
5820 switch (fs_info->fs_devices->read_policy) {
5821 default:
5822 /* Shouldn't happen, just warn and use pid instead of failing */
5823 btrfs_warn_rl(fs_info,
5824 "unknown read_policy type %u, reset to pid",
5825 fs_info->fs_devices->read_policy);
5826 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5827 fallthrough;
5828 case BTRFS_READ_POLICY_PID:
5829 preferred_mirror = first + (current->pid % num_stripes);
5830 break;
5831 }
5832
5833 if (dev_replace_is_ongoing &&
5834 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5835 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5836 srcdev = fs_info->dev_replace.srcdev;
5837 else
5838 srcdev = NULL;
5839
5840 /*
5841  * Try to avoid the drive that is the source drive for a
5842  * dev-replace procedure, only choose it if no other non-missing
5843  * mirror is available.
5844  */
5845 for (tolerance = 0; tolerance < 2; tolerance++) {
5846 if (map->stripes[preferred_mirror].dev->bdev &&
5847 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5848 return preferred_mirror;
5849 for (i = first; i < first + num_stripes; i++) {
5850 if (map->stripes[i].dev->bdev &&
5851 (tolerance || map->stripes[i].dev != srcdev))
5852 return i;
5853 }
5854 }
5855
5856 /* We couldn't find one that doesn't fail. Just return something
5857  * and the io error handling code will clean up eventually.
5858  */
5859 return preferred_mirror;
5860 }
5861
5862 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5863 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
5864 {
5865 int i;
5866 int again = 1;
5867
5868 while (again) {
5869 again = 0;
5870 for (i = 0; i < num_stripes - 1; i++) {
5871 /* Swap if parity is on a smaller index */
5872 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
5873 swap(bioc->stripes[i], bioc->stripes[i + 1]);
5874 swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
5875 again = 1;
5876 }
5877 }
5878 }
5879 }
5880
5881 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5882 int total_stripes,
5883 int real_stripes)
5884 {
5885 struct btrfs_io_context *bioc = kzalloc(
5886 /* The size of btrfs_io_context */
5887 sizeof(struct btrfs_io_context) +
5888 /* Plus the variable array for the stripes */
5889 sizeof(struct btrfs_io_stripe) * (total_stripes) +
5890 /* Plus the variable array for the tgtdev */
5891 sizeof(int) * (real_stripes) +
5892 /*
5893  * Plus the raid_map, which includes both the tgtdev
5894  * stripes and the normal stripes
5895  */
5896 sizeof(u64) * (total_stripes),
5897 GFP_NOFS|__GFP_NOFAIL);
5898
5899 atomic_set(&bioc->error, 0);
5900 refcount_set(&bioc->refs, 1);
5901
5902 bioc->fs_info = fs_info;
5903 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
5904 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
5905
5906 return bioc;
5907 }
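/*
 * Editor's illustration (not in the original source): the single allocation
 * above packs three variable arrays behind the fixed struct:
 *
 *   [ btrfs_io_context | stripes[total] | tgtdev_map[real] | raid_map[total] ]
 *
 * which is why tgtdev_map is computed as stripes + total_stripes and
 * raid_map as tgtdev_map + real_stripes.
 */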
5908
5909 void btrfs_get_bioc(struct btrfs_io_context *bioc)
5910 {
5911 WARN_ON(!refcount_read(&bioc->refs));
5912 refcount_inc(&bioc->refs);
5913 }
5914
5915 void btrfs_put_bioc(struct btrfs_io_context *bioc)
5916 {
5917 if (!bioc)
5918 return;
5919 if (refcount_dec_and_test(&bioc->refs))
5920 kfree(bioc);
5921 }
5922
5923 /*
5924  * Please note that, discard won't be sent to target device of device
5925  * replace.
5926  */
5927 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
5928 u64 logical, u64 *length_ret,
5929 u32 *num_stripes)
5930 {
5931 struct extent_map *em;
5932 struct map_lookup *map;
5933 struct btrfs_discard_stripe *stripes;
5934 u64 length = *length_ret;
5935 u64 offset;
5936 u64 stripe_nr;
5937 u64 stripe_nr_end;
5938 u64 stripe_end_offset;
5939 u64 stripe_cnt;
5940 u64 stripe_len;
5941 u64 stripe_offset;
5942 u32 stripe_index;
5943 u32 factor = 0;
5944 u32 sub_stripes = 0;
5945 u64 stripes_per_dev = 0;
5946 u32 remaining_stripes = 0;
5947 u32 last_stripe = 0;
5948 int ret;
5949 int i;
5950
5951 em = btrfs_get_chunk_map(fs_info, logical, length);
5952 if (IS_ERR(em))
5953 return ERR_CAST(em);
5954
5955 map = em->map_lookup;
5956
5957 /* We don't discard raid56 yet */
5958 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5959 ret = -EOPNOTSUPP;
5960 goto out_free_map;
5961 }
5962
5963 offset = logical - em->start;
5964 length = min_t(u64, em->start + em->len - logical, length);
5965 *length_ret = length;
5966
5967 stripe_len = map->stripe_len;
5968 /*
5969  * stripe_nr counts the total number of stripes we have to stride
5970  * to get to this block
5971  */
5972 stripe_nr = div64_u64(offset, stripe_len);
5973
5974 /* stripe_offset is the offset of this block in its stripe */
5975 stripe_offset = offset - stripe_nr * stripe_len;
5976
5977 stripe_nr_end = round_up(offset + length, map->stripe_len);
5978 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5979 stripe_cnt = stripe_nr_end - stripe_nr;
5980 stripe_end_offset = stripe_nr_end * map->stripe_len -
5981 (offset + length);
5982
5983 /*
5984  * After this, stripe_nr is the number of stripes on this device we have
5985  * to walk, and stripe_index is the number of our device in the stripe array
5986  */
5987 *num_stripes = 1;
5988 stripe_index = 0;
5989 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5990 BTRFS_BLOCK_GROUP_RAID10)) {
5991 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5992 sub_stripes = 1;
5993 else
5994 sub_stripes = map->sub_stripes;
5995
5996 factor = map->num_stripes / sub_stripes;
5997 *num_stripes = min_t(u64, map->num_stripes,
5998 sub_stripes * stripe_cnt);
5999 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6000 stripe_index *= sub_stripes;
6001 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
6002 &remaining_stripes);
6003 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
6004 last_stripe *= sub_stripes;
6005 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
6006 BTRFS_BLOCK_GROUP_DUP)) {
6007 *num_stripes = map->num_stripes;
6008 } else {
6009 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6010 &stripe_index);
6011 }
6012
6013 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
6014 if (!stripes) {
6015 ret = -ENOMEM;
6016 goto out_free_map;
6017 }
6018
6019 for (i = 0; i < *num_stripes; i++) {
6020 stripes[i].physical =
6021 map->stripes[stripe_index].physical +
6022 stripe_offset + stripe_nr * map->stripe_len;
6023 stripes[i].dev = map->stripes[stripe_index].dev;
6024
6025 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6026 BTRFS_BLOCK_GROUP_RAID10)) {
6027 stripes[i].length = stripes_per_dev * map->stripe_len;
6028
6029 if (i / sub_stripes < remaining_stripes)
6030 stripes[i].length += map->stripe_len;
6031
6032 /*
6033  * Special for the first stripe and
6034  * the last stripe:
6035  *
6036  * |-------|...|-------|
6037  *     |----------|
6038  *    off     end_off
6039  */
6040 if (i < sub_stripes)
6041 stripes[i].length -= stripe_offset;
6042
6043 if (stripe_index >= last_stripe &&
6044 stripe_index <= (last_stripe +
6045 sub_stripes - 1))
6046 stripes[i].length -= stripe_end_offset;
6047
6048 if (i == sub_stripes - 1)
6049 stripe_offset = 0;
6050 } else {
6051 stripes[i].length = length;
6052 }
6053
6054 stripe_index++;
6055 if (stripe_index == map->num_stripes) {
6056 stripe_index = 0;
6057 stripe_nr++;
6058 }
6059 }
6060
6061 free_extent_map(em);
6062 return stripes;
6063 out_free_map:
6064 free_extent_map(em);
6065 return ERR_PTR(ret);
6066 }
6067
6068 /*
6069  * In dev-replace case, for repair case (that's the only case where the
6070  * mirror is selected explicitly when calling btrfs_map_block), blocks
6071  * left of the left cursor can also be read from the target drive.
6072  *
6073  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to
6074  * the array of stripes.
6075  * For READ, it also needs to be supported using the same mirror number.
6076  *
6077  * If the requested block is not left of the left cursor, EIO is returned.
6078  * This can happen because btrfs_num_copies() returns one more in the
6079  * dev-replace case.
6080  */
6081 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6082 u64 logical, u64 length,
6083 u64 srcdev_devid, int *mirror_num,
6084 u64 *physical)
6085 {
6086 struct btrfs_io_context *bioc = NULL;
6087 int num_stripes;
6088 int index_srcdev = 0;
6089 int found = 0;
6090 u64 physical_of_found = 0;
6091 int i;
6092 int ret = 0;
6093
6094 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6095 logical, &length, &bioc, 0, 0);
6096 if (ret) {
6097 ASSERT(bioc == NULL);
6098 return ret;
6099 }
6100
6101 num_stripes = bioc->num_stripes;
6102 if (*mirror_num > num_stripes) {
6103 /*
6104  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6105  * that means that the requested area is not left of the left
6106  * cursor
6107  */
6108 btrfs_put_bioc(bioc);
6109 return -EIO;
6110 }
6111
6112 /*
6113  * Process the rest of the function using the mirror_num of the source
6114  * drive. Therefore look it up first. At the end, patch the device
6115  * pointer to the one of the target drive.
6116  */
6117 for (i = 0; i < num_stripes; i++) {
6118 if (bioc->stripes[i].dev->devid != srcdev_devid)
6119 continue;
6120
6121 /*
6122  * In case of DUP, in order to keep it simple, only add
6123  * the mirror with the lowest physical address
6124  */
6125 if (found &&
6126 physical_of_found <= bioc->stripes[i].physical)
6127 continue;
6128
6129 index_srcdev = i;
6130 found = 1;
6131 physical_of_found = bioc->stripes[i].physical;
6132 }
6133
6134 btrfs_put_bioc(bioc);
6135
6136 ASSERT(found);
6137 if (!found)
6138 return -EIO;
6139
6140 *mirror_num = index_srcdev + 1;
6141 *physical = physical_of_found;
6142 return ret;
6143 }
6144
6145 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6146 {
6147 struct btrfs_block_group *cache;
6148 bool ret;
6149
6150 /* Non zoned filesystem does not use "to_copy" flag */
6151 if (!btrfs_is_zoned(fs_info))
6152 return false;
6153
6154 cache = btrfs_lookup_block_group(fs_info, logical);
6155
6156 spin_lock(&cache->lock);
6157 ret = cache->to_copy;
6158 spin_unlock(&cache->lock);
6159
6160 btrfs_put_block_group(cache);
6161 return ret;
6162 }
6163
6164 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6165 struct btrfs_io_context **bioc_ret,
6166 struct btrfs_dev_replace *dev_replace,
6167 u64 logical,
6168 int *num_stripes_ret, int *max_errors_ret)
6169 {
6170 struct btrfs_io_context *bioc = *bioc_ret;
6171 u64 srcdev_devid = dev_replace->srcdev->devid;
6172 int tgtdev_indexes = 0;
6173 int num_stripes = *num_stripes_ret;
6174 int max_errors = *max_errors_ret;
6175 int i;
6176
6177 if (op == BTRFS_MAP_WRITE) {
6178 int index_where_to_add;
6179
6180 /*
6181  * A block group which has "to_copy" set will eventually be copied by
6182  * the dev-replace process. We can avoid cloning IO here.
6183  */
6184 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6185 return;
6186
6187 /*
6188  * Duplicate the write operations while the dev-replace procedure is
6189  * running. Since the copying of the old disk to the new disk takes
6190  * place at run time while the filesystem is mounted writable, the
6191  * regular write operations to the old disk have to be duplicated to go
6192  * to the new disk as well.
6193  *
6194  * Note that device->missing is handled by the caller, and that the
6195  * write to the old disk is already set up in the stripes array.
6196  */
6197
6198 index_where_to_add = num_stripes;
6199 for (i = 0; i < num_stripes; i++) {
6200 if (bioc->stripes[i].dev->devid == srcdev_devid) {
6201 /* Write to new disk, too */
6202 struct btrfs_io_stripe *new =
6203 bioc->stripes + index_where_to_add;
6204 struct btrfs_io_stripe *old =
6205 bioc->stripes + i;
6206
6207 new->physical = old->physical;
6208 new->dev = dev_replace->tgtdev;
6209 bioc->tgtdev_map[i] = index_where_to_add;
6210 index_where_to_add++;
6211 max_errors++;
6212 tgtdev_indexes++;
6213 }
6214 }
6215 num_stripes = index_where_to_add;
6216 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6217 int index_srcdev = 0;
6218 int found = 0;
6219 u64 physical_of_found = 0;
6220
6221 /*
6222  * During the dev-replace procedure, the target drive can also be used
6223  * to read data in case it is needed to repair a corrupt block elsewhere.
6224  * This is possible if the requested area is left of the left cursor.
6225  * In this area, the target drive is a full copy of the source drive.
6226  */
6227
6228 for (i = 0; i < num_stripes; i++) {
6229 if (bioc->stripes[i].dev->devid == srcdev_devid) {
6230
6231 /*
6232  * In case of DUP, in order to keep it simple, only add
6233  * the mirror with the lowest physical address
6234  */
6235 if (found &&
6236 physical_of_found <= bioc->stripes[i].physical)
6237 continue;
6238 index_srcdev = i;
6239 found = 1;
6240 physical_of_found = bioc->stripes[i].physical;
6241 }
6242 }
6243 if (found) {
6244 struct btrfs_io_stripe *tgtdev_stripe =
6245 bioc->stripes + num_stripes;
6246
6247 tgtdev_stripe->physical = physical_of_found;
6248 tgtdev_stripe->dev = dev_replace->tgtdev;
6249 bioc->tgtdev_map[index_srcdev] = num_stripes;
6250
6251 tgtdev_indexes++;
6252 num_stripes++;
6253 }
6254 }
6255
6256 *num_stripes_ret = num_stripes;
6257 *max_errors_ret = max_errors;
6258 bioc->num_tgtdevs = tgtdev_indexes;
6259 *bioc_ret = bioc;
6260 }
6261
6262 static bool need_full_stripe(enum btrfs_map_op op)
6263 {
6264 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6265 }
6266
6267 /*
6268  * Calculate the geometry of a particular (address, len) tuple. This
6269  * information is used to calculate how big a particular bio can get before
6270  * it straddles a stripe.
6271  *
6272  * @fs_info: the filesystem
6273  * @em:      mapping containing the logical extent
6274  * @op:      type of operation - write or read
6275  * @logical: address that we want to figure out the geometry of
6276  * @io_geom: pointer used to return values
6277  *
6278  * Returns < 0 in case a chunk for the given logical address cannot be found,
6279  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6280  */
6281 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6282 enum btrfs_map_op op, u64 logical,
6283 struct btrfs_io_geometry *io_geom)
6284 {
6285 struct map_lookup *map;
6286 u64 len;
6287 u64 offset;
6288 u64 stripe_offset;
6289 u64 stripe_nr;
6290 u32 stripe_len;
6291 u64 raid56_full_stripe_start = (u64)-1;
6292 int data_stripes;
6293
6294 ASSERT(op != BTRFS_MAP_DISCARD);
6295
6296 map = em->map_lookup;
6297 /* Offset of this logical address in the chunk */
6298 offset = logical - em->start;
6299 /* Len of a stripe in a chunk */
6300 stripe_len = map->stripe_len;
6301 /*
6302  * Stripe_nr is where this block falls in
6303  * stripe_offset is the offset of this block in its stripe.
6304  */
6305 stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
6306 ASSERT(stripe_offset < U32_MAX);
6307
6308 data_stripes = nr_data_stripes(map);
6309
6310 /* Only stripe based profiles need to check against stripe length. */
6311 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
6312 u64 max_len = stripe_len - stripe_offset;
6313
6314 /*
6315  * In case of raid56, we need to know the stripe aligned start
6316  */
6317 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6318 unsigned long full_stripe_len = stripe_len * data_stripes;
6319 raid56_full_stripe_start = offset;
6320
6321 /*
6322  * Allow a write of a full stripe, but make sure we
6323  * don't allow straddling of stripes
6324  */
6325 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6326 full_stripe_len);
6327 raid56_full_stripe_start *= full_stripe_len;
6328
6329 /*
6330  * For writes to RAID[56], allow a full stripeset across
6331  * all disks. For other RAID types and for RAID[56]
6332  * reads, just allow a single stripe (on a single disk).
6333  */
6334 if (op == BTRFS_MAP_WRITE) {
6335 max_len = stripe_len * data_stripes -
6336 (offset - raid56_full_stripe_start);
6337 }
6338 }
6339 len = min_t(u64, em->len - offset, max_len);
6340 } else {
6341 len = em->len - offset;
6342 }
6343
6344 io_geom->len = len;
6345 io_geom->offset = offset;
6346 io_geom->stripe_len = stripe_len;
6347 io_geom->stripe_nr = stripe_nr;
6348 io_geom->stripe_offset = stripe_offset;
6349 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6350
6351 return 0;
6352 }
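/*
 * Editor's worked example (not in the original source): a read at logical
 * em->start + 192KiB in a two-device RAID0 chunk with 64KiB stripe_len gives
 * offset = 192KiB, so stripe_nr = 3 and stripe_offset = 0. The non-RAID56
 * branch then caps len at stripe_len - stripe_offset = 64KiB, i.e. the bio
 * must not cross into stripe 4, which lives on the other device. Numbers
 * are illustrative only.
 */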
6353
6354 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6355 enum btrfs_map_op op,
6356 u64 logical, u64 *length,
6357 struct btrfs_io_context **bioc_ret,
6358 int mirror_num, int need_raid_map)
6359 {
6360 struct extent_map *em;
6361 struct map_lookup *map;
6362 u64 stripe_offset;
6363 u64 stripe_nr;
6364 u64 stripe_len;
6365 u32 stripe_index;
6366 int data_stripes;
6367 int i;
6368 int ret = 0;
6369 int num_stripes;
6370 int max_errors = 0;
6371 int tgtdev_indexes = 0;
6372 struct btrfs_io_context *bioc = NULL;
6373 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6374 int dev_replace_is_ongoing = 0;
6375 int num_alloc_stripes;
6376 int patch_the_first_stripe_for_dev_replace = 0;
6377 u64 physical_to_patch_in_first_stripe = 0;
6378 u64 raid56_full_stripe_start = (u64)-1;
6379 struct btrfs_io_geometry geom;
6380
6381 ASSERT(bioc_ret);
6382 ASSERT(op != BTRFS_MAP_DISCARD);
6383
6384 em = btrfs_get_chunk_map(fs_info, logical, *length);
6385 ASSERT(!IS_ERR(em));
6386
6387 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6388 if (ret < 0)
6389 return ret;
6390
6391 map = em->map_lookup;
6392
6393 *length = geom.len;
6394 stripe_len = geom.stripe_len;
6395 stripe_nr = geom.stripe_nr;
6396 stripe_offset = geom.stripe_offset;
6397 raid56_full_stripe_start = geom.raid56_stripe_offset;
6398 data_stripes = nr_data_stripes(map);
6399
6400 down_read(&dev_replace->rwsem);
6401 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6402 /*
6403  * Hold the semaphore for read during the whole operation, write is
6404  * requested at commit time but must wait.
6405  */
6406 if (!dev_replace_is_ongoing)
6407 up_read(&dev_replace->rwsem);
6408
6409 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6410 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6411 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6412 dev_replace->srcdev->devid,
6413 &mirror_num,
6414 &physical_to_patch_in_first_stripe);
6415 if (ret)
6416 goto out;
6417 else
6418 patch_the_first_stripe_for_dev_replace = 1;
6419 } else if (mirror_num > map->num_stripes) {
6420 mirror_num = 0;
6421 }
6422
6423 num_stripes = 1;
6424 stripe_index = 0;
6425 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6426 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6427 &stripe_index);
6428 if (!need_full_stripe(op))
6429 mirror_num = 1;
6430 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6431 if (need_full_stripe(op))
6432 num_stripes = map->num_stripes;
6433 else if (mirror_num)
6434 stripe_index = mirror_num - 1;
6435 else {
6436 stripe_index = find_live_mirror(fs_info, map, 0,
6437 dev_replace_is_ongoing);
6438 mirror_num = stripe_index + 1;
6439 }
6440
6441 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6442 if (need_full_stripe(op)) {
6443 num_stripes = map->num_stripes;
6444 } else if (mirror_num) {
6445 stripe_index = mirror_num - 1;
6446 } else {
6447 mirror_num = 1;
6448 }
6449
6450 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6451 u32 factor = map->num_stripes / map->sub_stripes;
6452
6453 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6454 stripe_index *= map->sub_stripes;
6455
6456 if (need_full_stripe(op))
6457 num_stripes = map->sub_stripes;
6458 else if (mirror_num)
6459 stripe_index += mirror_num - 1;
6460 else {
6461 int old_stripe_index = stripe_index;
6462 stripe_index = find_live_mirror(fs_info, map,
6463 stripe_index,
6464 dev_replace_is_ongoing);
6465 mirror_num = stripe_index - old_stripe_index + 1;
6466 }
6467
6468 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6469 ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
6470 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6471 /* Push stripe_nr back to the start of the full stripe */
6472 stripe_nr = div64_u64(raid56_full_stripe_start,
6473 stripe_len * data_stripes);
6474
6475 /* RAID[56] write or recovery. Return all stripes */
6476 num_stripes = map->num_stripes;
6477 max_errors = btrfs_chunk_max_errors(map);
6478
6479 /* Return the length to the full stripe end */
6480 *length = min(logical + *length,
6481 raid56_full_stripe_start + em->start +
6482 data_stripes * stripe_len) - logical;
6483 stripe_index = 0;
6484 stripe_offset = 0;
6485 } else {
6486 /*
6487  * Mirror #0 or #1 means the original data block.
6488  * Mirror #2 is RAID5 parity block.
6489  * Mirror #3 is RAID6 Q block.
6490  */
6491 stripe_nr = div_u64_rem(stripe_nr,
6492 data_stripes, &stripe_index);
6493 if (mirror_num > 1)
6494 stripe_index = data_stripes + mirror_num - 2;
6495
6496 /* We distribute the parity blocks across stripes */
6497 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6498 &stripe_index);
6499 if (!need_full_stripe(op) && mirror_num <= 1)
6500 mirror_num = 1;
6501 }
6502 } else {
6503 /*
6504  * After this, stripe_nr is the number of stripes on this device we
6505  * have to walk to find the data, and stripe_index is the number of
6506  * our device in the stripe array
6507  */
6508 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6509 &stripe_index);
6510 mirror_num = stripe_index + 1;
6511 }
6512 if (stripe_index >= map->num_stripes) {
6513 btrfs_crit(fs_info,
6514 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6515 stripe_index, map->num_stripes);
6516 ret = -EINVAL;
6517 goto out;
6518 }
6519
6520 num_alloc_stripes = num_stripes;
6521 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6522 if (op == BTRFS_MAP_WRITE)
6523 num_alloc_stripes <<= 1;
6524 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6525 num_alloc_stripes++;
6526 tgtdev_indexes = num_stripes;
6527 }
6528
6529 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
6530 if (!bioc) {
6531 ret = -ENOMEM;
6532 goto out;
6533 }
6534
6535 for (i = 0; i < num_stripes; i++) {
6536 bioc->stripes[i].physical = map->stripes[stripe_index].physical +
6537 stripe_offset + stripe_nr * map->stripe_len;
6538 bioc->stripes[i].dev = map->stripes[stripe_index].dev;
6539 stripe_index++;
6540 }
6541
6542 /* Build raid_map */
6543 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6544 (need_full_stripe(op) || mirror_num > 1)) {
6545 u64 tmp;
6546 unsigned rot;
6547
6548 /* Work out the disk rotation on this stripe-set */
6549 div_u64_rem(stripe_nr, num_stripes, &rot);
6550
6551 /* Fill in the logical address of each stripe */
6552 tmp = stripe_nr * data_stripes;
6553 for (i = 0; i < data_stripes; i++)
6554 bioc->raid_map[(i + rot) % num_stripes] =
6555 em->start + (tmp + i) * map->stripe_len;
6556
6557 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
6558 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6559 bioc->raid_map[(i + rot + 1) % num_stripes] =
6560 RAID6_Q_STRIPE;
6561
6562 sort_parity_stripes(bioc, num_stripes);
6563 }
6564
6565 if (need_full_stripe(op))
6566 max_errors = btrfs_chunk_max_errors(map);
6567
6568 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6569 need_full_stripe(op)) {
6570 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
6571 &num_stripes, &max_errors);
6572 }
6573
6574 *bioc_ret = bioc;
6575 bioc->map_type = map->type;
6576 bioc->num_stripes = num_stripes;
6577 bioc->max_errors = max_errors;
6578 bioc->mirror_num = mirror_num;
6579
6580 /*
6581  * This is the case that REQ_READ && dev_replace_is_ongoing &&
6582  * mirror_num == num_stripes + 1 && dev_replace target drive is
6583  * handled in handle_ops_on_dev_replace().
6584  */
6585 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6586 WARN_ON(num_stripes > 1);
6587 bioc->stripes[0].dev = dev_replace->tgtdev;
6588 bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
6589 bioc->mirror_num = map->num_stripes + 1;
6590 }
6591 out:
6592 if (dev_replace_is_ongoing) {
6593 lockdep_assert_held(&dev_replace->rwsem);
6594 /* Unlock and let waiting writers proceed */
6595 up_read(&dev_replace->rwsem);
6596 }
6597 free_extent_map(em);
6598 return ret;
6599 }
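/*
 * Editor's worked example (not in the original source): three-device RAID5,
 * so data_stripes = 2. For full stripe number 1, div_u64_rem(1, 3, &rot)
 * gives rot = 1, the loop fills raid_map[1] and raid_map[2] with the data
 * stripe addresses, and raid_map[(2 + 1) % 3] = raid_map[0] becomes
 * RAID5_P_STRIPE - the parity rotates by one device on each consecutive
 * full stripe. Illustrative only.
 */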
6600
6601 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6602 u64 logical, u64 *length,
6603 struct btrfs_io_context **bioc_ret, int mirror_num)
6604 {
6605 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
6606 mirror_num, 0);
6607 }
6608
6609 /* For Scrub/replace */
6610 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6611 u64 logical, u64 *length,
6612 struct btrfs_io_context **bioc_ret)
6613 {
6614 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
6615 }
6616
6617 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
6618 {
6619 if (bioc->orig_bio->bi_opf & REQ_META)
6620 return bioc->fs_info->endio_meta_workers;
6621 return bioc->fs_info->endio_workers;
6622 }
6623
6624 static void btrfs_end_bio_work(struct work_struct *work)
6625 {
6626 struct btrfs_bio *bbio =
6627 container_of(work, struct btrfs_bio, end_io_work);
6628
6629 bio_endio(&bbio->bio);
6630 }
6631
6632 static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
6633 {
6634 struct bio *orig_bio = bioc->orig_bio;
6635 struct btrfs_bio *bbio = btrfs_bio(orig_bio);
6636
6637 bbio->mirror_num = bioc->mirror_num;
6638 orig_bio->bi_private = bioc->private;
6639 orig_bio->bi_end_io = bioc->end_io;
6640
6641 /*
6642  * Only send an error to the higher layers if it is beyond the tolerance
6643  * threshold.
6644  */
6645 if (atomic_read(&bioc->error) > bioc->max_errors)
6646 orig_bio->bi_status = BLK_STS_IOERR;
6647 else
6648 orig_bio->bi_status = BLK_STS_OK;
6649
6650 if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
6651 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
6652 queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
6653 } else {
6654 bio_endio(orig_bio);
6655 }
6656
6657 btrfs_put_bioc(bioc);
6658 }
6659
6660 static void btrfs_end_bio(struct bio *bio)
6661 {
6662 struct btrfs_io_stripe *stripe = bio->bi_private;
6663 struct btrfs_io_context *bioc = stripe->bioc;
6664
6665 if (bio->bi_status) {
6666 atomic_inc(&bioc->error);
6667 if (bio->bi_status == BLK_STS_IOERR ||
6668 bio->bi_status == BLK_STS_TARGET) {
6669 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6670 btrfs_dev_stat_inc_and_print(stripe->dev,
6671 BTRFS_DEV_STAT_WRITE_ERRS);
6672 else if (!(bio->bi_opf & REQ_RAHEAD))
6673 btrfs_dev_stat_inc_and_print(stripe->dev,
6674 BTRFS_DEV_STAT_READ_ERRS);
6675 if (bio->bi_opf & REQ_PREFLUSH)
6676 btrfs_dev_stat_inc_and_print(stripe->dev,
6677 BTRFS_DEV_STAT_FLUSH_ERRS);
6678 }
6679 }
6680
6681 if (bio != bioc->orig_bio)
6682 bio_put(bio);
6683
6684 btrfs_bio_counter_dec(bioc->fs_info);
6685 if (atomic_dec_and_test(&bioc->stripes_pending))
6686 btrfs_end_bioc(bioc, true);
6687 }
6688
6689 static void submit_stripe_bio(struct btrfs_io_context *bioc,
6690 struct bio *orig_bio, int dev_nr, bool clone)
6691 {
6692 struct btrfs_fs_info *fs_info = bioc->fs_info;
6693 struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
6694 u64 physical = bioc->stripes[dev_nr].physical;
6695 struct bio *bio;
6696
6697 if (!dev || !dev->bdev ||
6698 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
6699 (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
6700 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6701 atomic_inc(&bioc->error);
6702 if (atomic_dec_and_test(&bioc->stripes_pending))
6703 btrfs_end_bioc(bioc, false);
6704 return;
6705 }
6706
6707 if (clone) {
6708 bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
6709 } else {
6710 bio = orig_bio;
6711 bio_set_dev(bio, dev->bdev);
6712 btrfs_bio(bio)->device = dev;
6713 }
6714
6715 bioc->stripes[dev_nr].bioc = bioc;
6716 bio->bi_private = &bioc->stripes[dev_nr];
6717 bio->bi_end_io = btrfs_end_bio;
6718 bio->bi_iter.bi_sector = physical >> 9;
6719
6720 /*
6721  * For zone append writing, bi_sector must point the beginning of the zone
6722  */
6723 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6724 if (btrfs_dev_is_sequential(dev, physical)) {
6725 u64 zone_start = round_down(physical, fs_info->zone_size);
6726
6727 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6728 } else {
6729 bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6730 bio->bi_opf |= REQ_OP_WRITE;
6731 }
6732 }
6733 btrfs_debug_in_rcu(fs_info,
6734 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6735 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6736 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6737 dev->devid, bio->bi_iter.bi_size);
6738
6739 btrfs_bio_counter_inc_noblocked(fs_info);
6740
6741 btrfsic_check_bio(bio);
6742 submit_bio(bio);
6743 }
6744
6745 void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
6746 {
6747 u64 logical = bio->bi_iter.bi_sector << 9;
6748 u64 length = bio->bi_iter.bi_size;
6749 u64 map_length = length;
6750 int ret;
6751 int dev_nr;
6752 int total_devs;
6753 struct btrfs_io_context *bioc = NULL;
6754
6755 btrfs_bio_counter_inc_blocked(fs_info);
6756 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6757 &map_length, &bioc, mirror_num, 1);
6758 if (ret) {
6759 btrfs_bio_counter_dec(fs_info);
6760 bio->bi_status = errno_to_blk_status(ret);
6761 bio_endio(bio);
6762 return;
6763 }
6764
6765 total_devs = bioc->num_stripes;
6766 bioc->orig_bio = bio;
6767 bioc->private = bio->bi_private;
6768 bioc->end_io = bio->bi_end_io;
6769 atomic_set(&bioc->stripes_pending, total_devs);
6770
6771 if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6772 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6773 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6774 raid56_parity_write(bio, bioc);
6775 else
6776 raid56_parity_recover(bio, bioc, mirror_num, true);
6777 return;
6778 }
6779
6780 if (map_length < length) {
6781 btrfs_crit(fs_info,
6782 "mapping failed logical %llu bio len %llu len %llu",
6783 logical, length, map_length);
6784 BUG();
6785 }
6786
6787 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6788 const bool should_clone = (dev_nr < total_devs - 1);
6789
6790 submit_stripe_bio(bioc, bio, dev_nr, should_clone);
6791 }
6792 btrfs_bio_counter_dec(fs_info);
6793 }
6794
6795 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6796 const struct btrfs_fs_devices *fs_devices)
6797 {
6798 if (args->fsid == NULL)
6799 return true;
6800 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6801 return true;
6802 return false;
6803 }
6804
6805 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6806 const struct btrfs_device *device)
6807 {
6808 ASSERT((args->devid != (u64)-1) || args->missing);
6809
6810 if ((args->devid != (u64)-1) && device->devid != args->devid)
6811 return false;
6812 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
6813 return false;
6814 if (!args->missing)
6815 return true;
6816 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6817 !device->bdev)
6818 return true;
6819 return false;
6820 }
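/*
 * Editor's illustration (not in the original source): a typical caller fills
 * the lookup args before calling btrfs_find_device() below, e.g.
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	args.devid = devid;
 *	args.uuid = dev_uuid;
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *
 * Leaving args.fsid NULL matches any fs_devices, and setting args.missing
 * selects devices known to the metadata but without an opened bdev (see
 * read_one_chunk() below for the devid + uuid pattern).
 */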
6821
6822 /*
6823  * Find a device specified by @args in the list of @fs_devices, or return
6824  * NULL.
6825  *
6826  * If devid and uuid are both specified, the match must be exact, otherwise
6827  * only devid is used.
6828  */
6829 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6830 const struct btrfs_dev_lookup_args *args)
6831 {
6832 struct btrfs_device *device;
6833 struct btrfs_fs_devices *seed_devs;
6834
6835 if (dev_args_match_fs_devices(args, fs_devices)) {
6836 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6837 if (dev_args_match_device(args, device))
6838 return device;
6839 }
6840 }
6841
6842 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6843 if (!dev_args_match_fs_devices(args, seed_devs))
6844 continue;
6845 list_for_each_entry(device, &seed_devs->devices, dev_list) {
6846 if (dev_args_match_device(args, device))
6847 return device;
6848 }
6849 }
6850
6851 return NULL;
6852 }
6853
6854 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6855 u64 devid, u8 *dev_uuid)
6856 {
6857 struct btrfs_device *device;
6858 unsigned int nofs_flag;
6859
6860 /*
6861  * We call this under the chunk_mutex, so we want to use NOFS for this
6862  * allocation. We don't want to change btrfs_alloc_device() to care
6863  * about the GFP flags as a whole, so we capture what we have already
6864  * and punt on it, since this is our primary interface.
6865  */
6866 nofs_flag = memalloc_nofs_save();
6867 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6868 memalloc_nofs_restore(nofs_flag);
6869 if (IS_ERR(device))
6870 return device;
6871
6872 list_add(&device->dev_list, &fs_devices->devices);
6873 device->fs_devices = fs_devices;
6874 fs_devices->num_devices++;
6875
6876 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6877 fs_devices->missing_devices++;
6878
6879 return device;
6880 }

/*
 * Allocate new device struct, set up devid and UUID.
 *
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new device structure or ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed
 * with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	return dev;
}
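
/*
 * Usage sketch (illustrative only): allocating a device with a known devid
 * and UUID, as add_missing_dev() above does, versus letting both be
 * generated for a brand new device.
 *
 *	// known identity, fs_info may be NULL:
 *	dev = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *
 *	// fresh identity; fs_info is required to pick the next devid:
 *	dev = btrfs_alloc_device(fs_info, NULL, NULL);
 *
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *	...
 *	btrfs_free_device(dev);	// if it was never linked onto any list
 */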

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			     devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

u64 btrfs_calc_stripe_length(const struct extent_map *em)
{
	const struct map_lookup *map = em->map_lookup;
	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);

	return div_u64(em->len, data_stripes);
}
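
/*
 * Worked example (illustrative): for a RAID5 chunk across 3 devices,
 * calc_data_stripes() yields num_stripes - nparity = 3 - 1 = 2 data
 * stripes, so a chunk with em->len == 2GiB occupies a 1GiB stripe on each
 * device. For single-copy profiles without parity (e.g. RAID0) it is
 * simply em->len / num_stripes.
 */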

#if BITS_PER_LONG == 32
/*
 * Due to the page cache limit, metadata beyond MAX_LFS_FILESIZE can not be
 * accessed on 32bit systems.
 *
 * This checks at mount time that the fs doesn't already have a metadata
 * chunk at or beyond that limit, and rejects the mount if it does.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return 0;

	if (logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}

/*
 * This is to give early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * Although we can still access the metadata, it's not going to be possible
 * once the limit is reached.
 */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return;

	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		return;

	btrfs_warn_32bit_limit(fs_info);
}
#endif
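
/*
 * Numeric sketch (assumption: 4KiB pages): on 32bit, MAX_LFS_FILESIZE works
 * out to roughly 2^32 pages * 4KiB = 16TiB, so metadata chunk logical
 * addresses must stay below about 16TiB. BTRFS_32BIT_EARLY_WARN_THRESHOLD
 * is a somewhat lower value used to warn before that hard limit is hit.
 */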

static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
						  u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	if (!btrfs_test_opt(fs_info, DEGRADED)) {
		btrfs_report_missing_device(fs_info, devid, uuid, true);
		return ERR_PTR(-ENOENT);
	}

	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
	if (IS_ERR(dev)) {
		btrfs_err(fs_info, "failed to init missing device %llu: %ld",
			  devid, PTR_ERR(dev));
		return dev;
	}
	btrfs_report_missing_device(fs_info, devid, uuid, false);

	return dev;
}

static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = type;
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	map->verified_stripes = 0;
	em->orig_block_len = btrfs_calc_stripe_length(em);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		args.devid = devid;
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		args.uuid = uuid;
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
		if (!map->stripes[i].dev) {
			map->stripes[i].dev = handle_missing_device(fs_info,
								    devid, uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				return PTR_ERR(map->stripes[i].dev);
			}
		}

		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
			&(map->stripes[i].dev->dev_state));
	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	free_extent_map(em);

	return ret;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				  struct btrfs_dev_item *dev_item,
				  struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of
	 * the respective fs_devices and anchor it at
	 * fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}

static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	args.devid = devid;
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);
	args.uuid = dev_uuid;
	args.fsid = fs_uuid;

	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, &args);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
						    dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				  "failed to add missing dev %llu: %ld",
				  devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
						    dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * This happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
					&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		u64 max_total_bytes = bdev_nr_bytes(device->bdev);

		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
			     &fs_info->free_chunk_space);
	}
	return 0;
}

int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);

	/*
	 * We allocated a dummy extent, just to use extent buffer accessors.
	 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
	 * that's fine, we will not go beyond system chunk array anyway.
	 */
	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (!sb)
		return -ENOMEM;
	set_extent_buffer_uptodate(sb);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}

		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
			"invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}

		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
			btrfs_err(fs_info,
			"invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
			ret = -EIO;
			break;
		}

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
			break;

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
		  len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}
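
/*
 * Layout of the sys_chunk_array parsed above (illustrative):
 *
 *	| disk_key | chunk (1 stripe) | disk_key | chunk (N stripes) | ...
 *
 * Each CHUNK_ITEM key is followed directly by its chunk item, whose size
 * depends on the stripe count (btrfs_chunk_item_size()), hence the two-step
 * length check before and after reading num_stripes.
 */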

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
			       struct btrfs_device *failing_dev)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
	read_unlock(&map_tree->lock);

	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
					   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->lock);
		em = lookup_extent_mapping(map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->lock);
	}
out:
	return ret;
}

static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	int iter_ret = 0;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about possible circular locking dependency between
	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
	 * used for freeze procection of a fs (struct super_block.s_umount),
	 * which theoretically we can also get on umount, and the uuid_mutex.
	 *
	 * We are only reading the chunk tree here, at mount time, before the
	 * filesystem is exposed (asserted below), so nothing else can modify
	 * it concurrently and the extent buffer locks can safely be skipped.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *node = path->nodes[1];

		leaf = path->nodes[0];
		slot = path->slots[0];

		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;

			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
	}

	if (iter_ret < 0) {
		ret = iter_ret;
		goto error;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
			   btrfs_super_num_devices(fs_info->super_copy),
			   total_dev);
		fs_info->fs_devices->total_devices = total_dev;
		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			   ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}
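
/*
 * Offset sketch (illustrative): (unsigned long)ptr is the item's byte offset
 * inside the extent buffer, so stat value i lives at
 * ptr + offsetof(struct btrfs_dev_stats_item, values) + i * sizeof(u64).
 * With the values array at the start of the item, index 2 is 16 bytes in.
 */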

static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barriers pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);

	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return;

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			  rcu_str_deref(dev->name),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Return the number of copies of the data for the given block group profile,
 * i.e. the ncopies field of the matching btrfs_raid_array entry.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
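
/*
 * Worked examples (illustrative), from btrfs_raid_array at the top of this
 * file: RAID0/SINGLE -> factor 1, RAID1/RAID10/DUP -> 2, RAID1C3 -> 3,
 * RAID1C4 -> 4. E.g. 1GiB of data in a RAID1 block group consumes 2GiB of
 * raw device space.
 */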

/* Verify that a single dev extent matches its chunk mapping and device. */
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = btrfs_calc_stripe_length(em);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
	 * space. Although kernel can handle it without problem, better to warn
	 * the users.
	 */
	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
		btrfs_warn(fs_info,
		"devid %llu physical %llu len %llu inside the reserved space",
			   devid, physical_offset, physical_len);

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for another reason.
	 *
	 * The verification relies on the device tree being usable; with
	 * IGNOREBADROOTS the user has explicitly asked to mount anyway, so
	 * skip it rather than fail the mount.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	sb_start_write(fs_info->sb);
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		sb_end_write(fs_info->sb);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
	sb_end_write(fs_info->sb);

	return ret;
}

bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))
		return false;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return true;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return true;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return true;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return true;
}