0001
0002
0003 #include <linux/bitops.h>
0004 #include <linux/slab.h>
0005 #include <linux/bio.h>
0006 #include <linux/mm.h>
0007 #include <linux/pagemap.h>
0008 #include <linux/page-flags.h>
0009 #include <linux/sched/mm.h>
0010 #include <linux/spinlock.h>
0011 #include <linux/blkdev.h>
0012 #include <linux/swap.h>
0013 #include <linux/writeback.h>
0014 #include <linux/pagevec.h>
0015 #include <linux/prefetch.h>
0016 #include <linux/fsverity.h>
0017 #include "misc.h"
0018 #include "extent_io.h"
0019 #include "extent-io-tree.h"
0020 #include "extent_map.h"
0021 #include "ctree.h"
0022 #include "btrfs_inode.h"
0023 #include "volumes.h"
0024 #include "check-integrity.h"
0025 #include "locking.h"
0026 #include "rcu-string.h"
0027 #include "backref.h"
0028 #include "disk-io.h"
0029 #include "subpage.h"
0030 #include "zoned.h"
0031 #include "block-group.h"
0032 #include "compression.h"
0033
0034 static struct kmem_cache *extent_state_cache;
0035 static struct kmem_cache *extent_buffer_cache;
0036 static struct bio_set btrfs_bioset;
0037
0038 static inline bool extent_state_in_tree(const struct extent_state *state)
0039 {
0040 return !RB_EMPTY_NODE(&state->rb_node);
0041 }
0042
0043 #ifdef CONFIG_BTRFS_DEBUG
0044 static LIST_HEAD(states);
0045 static DEFINE_SPINLOCK(leak_lock);
0046
0047 static inline void btrfs_leak_debug_add(spinlock_t *lock,
0048 struct list_head *new,
0049 struct list_head *head)
0050 {
0051 unsigned long flags;
0052
0053 spin_lock_irqsave(lock, flags);
0054 list_add(new, head);
0055 spin_unlock_irqrestore(lock, flags);
0056 }
0057
0058 static inline void btrfs_leak_debug_del(spinlock_t *lock,
0059 struct list_head *entry)
0060 {
0061 unsigned long flags;
0062
0063 spin_lock_irqsave(lock, flags);
0064 list_del(entry);
0065 spin_unlock_irqrestore(lock, flags);
0066 }
0067
0068 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
0069 {
0070 struct extent_buffer *eb;
0071 unsigned long flags;
0072
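/*
 * If open_ctree() never got far enough to initialize allocated_ebs, the
 * list head is still zeroed; nothing to check in that case.
 */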
0077 if (!fs_info->allocated_ebs.next)
0078 return;
0079
0080 WARN_ON(!list_empty(&fs_info->allocated_ebs));
0081 spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
0082 while (!list_empty(&fs_info->allocated_ebs)) {
0083 eb = list_first_entry(&fs_info->allocated_ebs,
0084 struct extent_buffer, leak_list);
0085 pr_err(
0086 "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
0087 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
0088 btrfs_header_owner(eb));
0089 list_del(&eb->leak_list);
0090 kmem_cache_free(extent_buffer_cache, eb);
0091 }
0092 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
0093 }
0094
0095 static inline void btrfs_extent_state_leak_debug_check(void)
0096 {
0097 struct extent_state *state;
0098
0099 while (!list_empty(&states)) {
0100 state = list_entry(states.next, struct extent_state, leak_list);
0101 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
0102 state->start, state->end, state->state,
0103 extent_state_in_tree(state),
0104 refcount_read(&state->refs));
0105 list_del(&state->leak_list);
0106 kmem_cache_free(extent_state_cache, state);
0107 }
0108 }
0109
0110 #define btrfs_debug_check_extent_io_range(tree, start, end) \
0111 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
0112 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
0113 struct extent_io_tree *tree, u64 start, u64 end)
0114 {
0115 struct inode *inode = tree->private_data;
0116 u64 isize;
0117
0118 if (!inode || !is_data_inode(inode))
0119 return;
0120
0121 isize = i_size_read(inode);
0122 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
0123 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
0124 "%s: ino %llu isize %llu odd range [%llu,%llu]",
0125 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
0126 }
0127 }
0128 #else
0129 #define btrfs_leak_debug_add(lock, new, head) do {} while (0)
0130 #define btrfs_leak_debug_del(lock, entry) do {} while (0)
0131 #define btrfs_extent_state_leak_debug_check() do {} while (0)
0132 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
0133 #endif
0134
0135 struct tree_entry {
0136 u64 start;
0137 u64 end;
0138 struct rb_node rb_node;
0139 };
0140
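/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes there are before the stripe/ordered extent boundary.
 */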
0145 struct btrfs_bio_ctrl {
0146 struct bio *bio;
0147 int mirror_num;
0148 enum btrfs_compression_type compress_type;
0149 u32 len_to_stripe_boundary;
0150 u32 len_to_oe_boundary;
0151 };
0152
0153 struct extent_page_data {
0154 struct btrfs_bio_ctrl bio_ctrl;
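/*
 * Tells writepage not to lock the state bits for this range, it still does
 * the unlocking.
 */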
0158 unsigned int extent_locked:1;
0159
0160
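/* Tells the submit_bio code to use REQ_SYNC */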
0161 unsigned int sync_io:1;
0162 };
0163
0164 static int add_extent_changeset(struct extent_state *state, u32 bits,
0165 struct extent_changeset *changeset,
0166 int set)
0167 {
0168 int ret;
0169
0170 if (!changeset)
0171 return 0;
0172 if (set && (state->state & bits) == bits)
0173 return 0;
0174 if (!set && (state->state & bits) == 0)
0175 return 0;
0176 changeset->bytes_changed += state->end - state->start + 1;
0177 ret = ulist_add(&changeset->range_changed, state->start, state->end,
0178 GFP_ATOMIC);
0179 return ret;
0180 }
0181
0182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
0183 {
0184 struct bio *bio;
0185 struct bio_vec *bv;
0186 struct inode *inode;
0187 int mirror_num;
0188
0189 if (!bio_ctrl->bio)
0190 return;
0191
0192 bio = bio_ctrl->bio;
0193 bv = bio_first_bvec_all(bio);
0194 inode = bv->bv_page->mapping->host;
0195 mirror_num = bio_ctrl->mirror_num;
0196
0197
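/* Caller should ensure the bio has at least some range added */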
0198 ASSERT(bio->bi_iter.bi_size);
0199
0200 btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
0201
0202 if (!is_data_inode(inode))
0203 btrfs_submit_metadata_bio(inode, bio, mirror_num);
0204 else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
0205 btrfs_submit_data_write_bio(inode, bio, mirror_num);
0206 else
0207 btrfs_submit_data_read_bio(inode, bio, mirror_num,
0208 bio_ctrl->compress_type);
0209
0210
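/* The bio is owned by the end_io handler from this point on */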
0211 bio_ctrl->bio = NULL;
0212 }
0213
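/*
 * Submit or fail the current bio in an extent_page_data structure.  A non-zero
 * @ret ends the bio with that error instead of submitting it.
 */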
0217 static void submit_write_bio(struct extent_page_data *epd, int ret)
0218 {
0219 struct bio *bio = epd->bio_ctrl.bio;
0220
0221 if (!bio)
0222 return;
0223
0224 if (ret) {
0225 ASSERT(ret < 0);
0226 bio->bi_status = errno_to_blk_status(ret);
0227 bio_endio(bio);
0228
0229 epd->bio_ctrl.bio = NULL;
0230 } else {
0231 submit_one_bio(&epd->bio_ctrl);
0232 }
0233 }
0234
0235 int __init extent_state_cache_init(void)
0236 {
0237 extent_state_cache = kmem_cache_create("btrfs_extent_state",
0238 sizeof(struct extent_state), 0,
0239 SLAB_MEM_SPREAD, NULL);
0240 if (!extent_state_cache)
0241 return -ENOMEM;
0242 return 0;
0243 }
0244
0245 int __init extent_io_init(void)
0246 {
0247 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
0248 sizeof(struct extent_buffer), 0,
0249 SLAB_MEM_SPREAD, NULL);
0250 if (!extent_buffer_cache)
0251 return -ENOMEM;
0252
0253 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
0254 offsetof(struct btrfs_bio, bio),
0255 BIOSET_NEED_BVECS))
0256 goto free_buffer_cache;
0257
0258 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
0259 goto free_bioset;
0260
0261 return 0;
0262
0263 free_bioset:
0264 bioset_exit(&btrfs_bioset);
0265
0266 free_buffer_cache:
0267 kmem_cache_destroy(extent_buffer_cache);
0268 extent_buffer_cache = NULL;
0269 return -ENOMEM;
0270 }
0271
0272 void __cold extent_state_cache_exit(void)
0273 {
0274 btrfs_extent_state_leak_debug_check();
0275 kmem_cache_destroy(extent_state_cache);
0276 }
0277
0278 void __cold extent_io_exit(void)
0279 {
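/*
 * Make sure all delayed rcu free are flushed before we destroy the caches.
 */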
0284 rcu_barrier();
0285 kmem_cache_destroy(extent_buffer_cache);
0286 bioset_exit(&btrfs_bioset);
0287 }
0288
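/*
 * A separate lockdep class for the file extent tree's lock, so that its
 * different lock nesting (relative to the other io_tree locks) does not
 * trigger false lockdep warnings.
 */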
0296 static struct lock_class_key file_extent_tree_class;
0297
0298 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
0299 struct extent_io_tree *tree, unsigned int owner,
0300 void *private_data)
0301 {
0302 tree->fs_info = fs_info;
0303 tree->state = RB_ROOT;
0304 tree->dirty_bytes = 0;
0305 spin_lock_init(&tree->lock);
0306 tree->private_data = private_data;
0307 tree->owner = owner;
0308 if (owner == IO_TREE_INODE_FILE_EXTENT)
0309 lockdep_set_class(&tree->lock, &file_extent_tree_class);
0310 }
0311
0312 void extent_io_tree_release(struct extent_io_tree *tree)
0313 {
0314 spin_lock(&tree->lock);
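/*
 * Do a single barrier for the waitqueue_active check below; the state of the
 * waitqueue should not change once extent_io_tree_release() has been called.
 */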
0320 smp_mb();
0321 while (!RB_EMPTY_ROOT(&tree->state)) {
0322 struct rb_node *node;
0323 struct extent_state *state;
0324
0325 node = rb_first(&tree->state);
0326 state = rb_entry(node, struct extent_state, rb_node);
0327 rb_erase(&state->rb_node, &tree->state);
0328 RB_CLEAR_NODE(&state->rb_node);
0329
0330
0331
0332
0333 ASSERT(!waitqueue_active(&state->wq));
0334 free_extent_state(state);
0335
0336 cond_resched_lock(&tree->lock);
0337 }
0338 spin_unlock(&tree->lock);
0339 }
0340
0341 static struct extent_state *alloc_extent_state(gfp_t mask)
0342 {
0343 struct extent_state *state;
0344
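/*
 * The given mask might not be appropriate for the slab allocator, drop the
 * unsupported bits.
 */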
0349 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
0350 state = kmem_cache_alloc(extent_state_cache, mask);
0351 if (!state)
0352 return state;
0353 state->state = 0;
0354 state->failrec = NULL;
0355 RB_CLEAR_NODE(&state->rb_node);
0356 btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
0357 refcount_set(&state->refs, 1);
0358 init_waitqueue_head(&state->wq);
0359 trace_alloc_extent_state(state, mask, _RET_IP_);
0360 return state;
0361 }
0362
0363 void free_extent_state(struct extent_state *state)
0364 {
0365 if (!state)
0366 return;
0367 if (refcount_dec_and_test(&state->refs)) {
0368 WARN_ON(extent_state_in_tree(state));
0369 btrfs_leak_debug_del(&leak_lock, &state->leak_list);
0370 trace_free_extent_state(state, _RET_IP_);
0371 kmem_cache_free(extent_state_cache, state);
0372 }
0373 }
0374
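/*
 * Search @tree for an entry that contains @offset, i.e. one where
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where the new node should be anchored (used when
 *              inserting an entry into the tree)
 * @parent_ret: points to the entry that would be the parent of a new entry
 *
 * Return a pointer to the entry that contains @offset and leave @node_ret and
 * @parent_ret untouched.  If no such entry exists, fill @node_ret and
 * @parent_ret and return the first entry that ends at or after @offset (NULL
 * if there is none).
 */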
0392 static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
0393 u64 offset,
0394 struct rb_node ***node_ret,
0395 struct rb_node **parent_ret)
0396 {
0397 struct rb_root *root = &tree->state;
0398 struct rb_node **node = &root->rb_node;
0399 struct rb_node *prev = NULL;
0400 struct tree_entry *entry;
0401
0402 while (*node) {
0403 prev = *node;
0404 entry = rb_entry(prev, struct tree_entry, rb_node);
0405
0406 if (offset < entry->start)
0407 node = &(*node)->rb_left;
0408 else if (offset > entry->end)
0409 node = &(*node)->rb_right;
0410 else
0411 return *node;
0412 }
0413
0414 if (node_ret)
0415 *node_ret = node;
0416 if (parent_ret)
0417 *parent_ret = prev;
0418
0419
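/* Search the neighbors until we find the first entry ending at or after @offset */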
0420 while (prev && offset > entry->end) {
0421 prev = rb_next(prev);
0422 entry = rb_entry(prev, struct tree_entry, rb_node);
0423 }
0424
0425 return prev;
0426 }
0427
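/*
 * Inexact rb-tree search: return the entry containing @offset, or the next
 * entry if there is no exact match.
 */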
0431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
0432 {
0433 return tree_search_for_insert(tree, offset, NULL, NULL);
0434 }
0435
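/*
 * Search @offset in the tree and fill the neighbor rbtree node pointers.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @prev_ret:  returns the last entry that ends before @offset
 * @next_ret:  returns the first entry that starts after @offset
 *
 * Return a pointer to the entry that contains @offset.  If no such entry
 * exists, return NULL and fill @prev_ret and @next_ret instead.
 */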
0448 static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
0449 u64 offset,
0450 struct rb_node **prev_ret,
0451 struct rb_node **next_ret)
0452 {
0453 struct rb_root *root = &tree->state;
0454 struct rb_node **node = &root->rb_node;
0455 struct rb_node *prev = NULL;
0456 struct rb_node *orig_prev = NULL;
0457 struct tree_entry *entry;
0458
0459 ASSERT(prev_ret);
0460 ASSERT(next_ret);
0461
0462 while (*node) {
0463 prev = *node;
0464 entry = rb_entry(prev, struct tree_entry, rb_node);
0465
0466 if (offset < entry->start)
0467 node = &(*node)->rb_left;
0468 else if (offset > entry->end)
0469 node = &(*node)->rb_right;
0470 else
0471 return *node;
0472 }
0473
0474 orig_prev = prev;
0475 while (prev && offset > entry->end) {
0476 prev = rb_next(prev);
0477 entry = rb_entry(prev, struct tree_entry, rb_node);
0478 }
0479 *next_ret = prev;
0480 prev = orig_prev;
0481
0482 entry = rb_entry(prev, struct tree_entry, rb_node);
0483 while (prev && offset < entry->start) {
0484 prev = rb_prev(prev);
0485 entry = rb_entry(prev, struct tree_entry, rb_node);
0486 }
0487 *prev_ret = prev;
0488
0489 return NULL;
0490 }
0491
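/*
 * Utility function to look for merge candidates inside a given range.  Any
 * extents with matching state are merged together into a single extent in the
 * tree.  Extents with EXTENT_LOCKED or EXTENT_BOUNDARY set are not merged.
 *
 * This should be called with the tree lock held.
 */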
0501 static void merge_state(struct extent_io_tree *tree,
0502 struct extent_state *state)
0503 {
0504 struct extent_state *other;
0505 struct rb_node *other_node;
0506
0507 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
0508 return;
0509
0510 other_node = rb_prev(&state->rb_node);
0511 if (other_node) {
0512 other = rb_entry(other_node, struct extent_state, rb_node);
0513 if (other->end == state->start - 1 &&
0514 other->state == state->state) {
0515 if (tree->private_data &&
0516 is_data_inode(tree->private_data))
0517 btrfs_merge_delalloc_extent(tree->private_data,
0518 state, other);
0519 state->start = other->start;
0520 rb_erase(&other->rb_node, &tree->state);
0521 RB_CLEAR_NODE(&other->rb_node);
0522 free_extent_state(other);
0523 }
0524 }
0525 other_node = rb_next(&state->rb_node);
0526 if (other_node) {
0527 other = rb_entry(other_node, struct extent_state, rb_node);
0528 if (other->start == state->end + 1 &&
0529 other->state == state->state) {
0530 if (tree->private_data &&
0531 is_data_inode(tree->private_data))
0532 btrfs_merge_delalloc_extent(tree->private_data,
0533 state, other);
0534 state->end = other->end;
0535 rb_erase(&other->rb_node, &tree->state);
0536 RB_CLEAR_NODE(&other->rb_node);
0537 free_extent_state(other);
0538 }
0539 }
0540 }
0541
0542 static void set_state_bits(struct extent_io_tree *tree,
0543 struct extent_state *state, u32 bits,
0544 struct extent_changeset *changeset);
0545
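/*
 * Insert an extent_state struct into the tree.  The bits are set on the
 * struct before it is inserted.
 *
 * Returns -EEXIST if an existing entry overlaps the new state.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */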
0556 static int insert_state(struct extent_io_tree *tree,
0557 struct extent_state *state,
0558 u32 bits, struct extent_changeset *changeset)
0559 {
0560 struct rb_node **node;
0561 struct rb_node *parent;
0562 const u64 end = state->end;
0563
0564 set_state_bits(tree, state, bits, changeset);
0565
0566 node = &tree->state.rb_node;
0567 while (*node) {
0568 struct tree_entry *entry;
0569
0570 parent = *node;
0571 entry = rb_entry(parent, struct tree_entry, rb_node);
0572
0573 if (end < entry->start) {
0574 node = &(*node)->rb_left;
0575 } else if (end > entry->end) {
0576 node = &(*node)->rb_right;
0577 } else {
0578 btrfs_err(tree->fs_info,
0579 "found node %llu %llu on insert of %llu %llu",
0580 entry->start, entry->end, state->start, end);
0581 return -EEXIST;
0582 }
0583 }
0584
0585 rb_link_node(&state->rb_node, parent, node);
0586 rb_insert_color(&state->rb_node, &tree->state);
0587
0588 merge_state(tree, state);
0589 return 0;
0590 }
0591
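/* Insert state into @tree at the location given by @node and @parent. */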
0595 static void insert_state_fast(struct extent_io_tree *tree,
0596 struct extent_state *state, struct rb_node **node,
0597 struct rb_node *parent, unsigned bits,
0598 struct extent_changeset *changeset)
0599 {
0600 set_state_bits(tree, state, bits, changeset);
0601 rb_link_node(&state->rb_node, parent, node);
0602 rb_insert_color(&state->rb_node, &tree->state);
0603 merge_state(tree, state);
0604 }
0605
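/*
 * Split a given extent state struct in two, inserting the preallocated struct
 * 'prealloc' as the newly created first half.  'split' indicates the offset
 * inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two tree nodes:
 *     prealloc: [orig->start, split - 1]
 *     orig:     [split, orig->end]
 *
 * The tree locks are not taken by this function, they need to be held by the
 * caller.
 */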
0620 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
0621 struct extent_state *prealloc, u64 split)
0622 {
0623 struct rb_node *parent = NULL;
0624 struct rb_node **node;
0625
0626 if (tree->private_data && is_data_inode(tree->private_data))
0627 btrfs_split_delalloc_extent(tree->private_data, orig, split);
0628
0629 prealloc->start = orig->start;
0630 prealloc->end = split - 1;
0631 prealloc->state = orig->state;
0632 orig->start = split;
0633
0634 parent = &orig->rb_node;
0635 node = &parent;
0636 while (*node) {
0637 struct tree_entry *entry;
0638
0639 parent = *node;
0640 entry = rb_entry(parent, struct tree_entry, rb_node);
0641
0642 if (prealloc->end < entry->start) {
0643 node = &(*node)->rb_left;
0644 } else if (prealloc->end > entry->end) {
0645 node = &(*node)->rb_right;
0646 } else {
0647 free_extent_state(prealloc);
0648 return -EEXIST;
0649 }
0650 }
0651
0652 rb_link_node(&prealloc->rb_node, parent, node);
0653 rb_insert_color(&prealloc->rb_node, &tree->state);
0654
0655 return 0;
0656 }
0657
0658 static struct extent_state *next_state(struct extent_state *state)
0659 {
0660 struct rb_node *next = rb_next(&state->rb_node);
0661 if (next)
0662 return rb_entry(next, struct extent_state, rb_node);
0663 else
0664 return NULL;
0665 }
0666
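/*
 * Utility function to clear some bits in an extent state struct.  It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the struct is
 * freed and removed from the tree.
 */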
0674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
0675 struct extent_state *state,
0676 u32 bits, int wake,
0677 struct extent_changeset *changeset)
0678 {
0679 struct extent_state *next;
0680 u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
0681 int ret;
0682
0683 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
0684 u64 range = state->end - state->start + 1;
0685 WARN_ON(range > tree->dirty_bytes);
0686 tree->dirty_bytes -= range;
0687 }
0688
0689 if (tree->private_data && is_data_inode(tree->private_data))
0690 btrfs_clear_delalloc_extent(tree->private_data, state, bits);
0691
0692 ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
0693 BUG_ON(ret < 0);
0694 state->state &= ~bits_to_clear;
0695 if (wake)
0696 wake_up(&state->wq);
0697 if (state->state == 0) {
0698 next = next_state(state);
0699 if (extent_state_in_tree(state)) {
0700 rb_erase(&state->rb_node, &tree->state);
0701 RB_CLEAR_NODE(&state->rb_node);
0702 free_extent_state(state);
0703 } else {
0704 WARN_ON(1);
0705 }
0706 } else {
0707 merge_state(tree, state);
0708 next = next_state(state);
0709 }
0710 return next;
0711 }
0712
0713 static struct extent_state *
0714 alloc_extent_state_atomic(struct extent_state *prealloc)
0715 {
0716 if (!prealloc)
0717 prealloc = alloc_extent_state(GFP_ATOMIC);
0718
0719 return prealloc;
0720 }
0721
0722 static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
0723 {
0724 btrfs_panic(tree->fs_info, err,
0725 "locking error: extent tree was modified by another thread while locked");
0726 }
0727
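/*
 * Clear some bits on a range in the tree.  This may require splitting or
 * inserting elements in the tree, so the gfp mask is used to indicate which
 * allocations or sleeping are allowed.
 *
 * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
 * range from the tree regardless of state (ie for truncate).
 *
 * The range [start, end] is inclusive.
 *
 * This takes the tree lock and always returns 0.
 */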
0740 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
0741 u32 bits, int wake, int delete,
0742 struct extent_state **cached_state,
0743 gfp_t mask, struct extent_changeset *changeset)
0744 {
0745 struct extent_state *state;
0746 struct extent_state *cached;
0747 struct extent_state *prealloc = NULL;
0748 struct rb_node *node;
0749 u64 last_end;
0750 int err;
0751 int clear = 0;
0752
0753 btrfs_debug_check_extent_io_range(tree, start, end);
0754 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
0755
0756 if (bits & EXTENT_DELALLOC)
0757 bits |= EXTENT_NORESERVE;
0758
0759 if (delete)
0760 bits |= ~EXTENT_CTLBITS;
0761
0762 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
0763 clear = 1;
0764 again:
0765 if (!prealloc && gfpflags_allow_blocking(mask)) {
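/*
 * Don't care about allocation failure here because we might end up not
 * needing the pre-allocated extent state at all, which is the case if we
 * only have in the tree extent states that cover our input range and don't
 * cover any other range.  If we end up needing a new extent state we
 * allocate it later.
 */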
0773 prealloc = alloc_extent_state(mask);
0774 }
0775
0776 spin_lock(&tree->lock);
0777 if (cached_state) {
0778 cached = *cached_state;
0779
0780 if (clear) {
0781 *cached_state = NULL;
0782 cached_state = NULL;
0783 }
0784
0785 if (cached && extent_state_in_tree(cached) &&
0786 cached->start <= start && cached->end > start) {
0787 if (clear)
0788 refcount_dec(&cached->refs);
0789 state = cached;
0790 goto hit_next;
0791 }
0792 if (clear)
0793 free_extent_state(cached);
0794 }
0795
0796
0797
0798
0799 node = tree_search(tree, start);
0800 if (!node)
0801 goto out;
0802 state = rb_entry(node, struct extent_state, rb_node);
0803 hit_next:
0804 if (state->start > end)
0805 goto out;
0806 WARN_ON(state->end < start);
0807 last_end = state->end;
0808
0809
0810 if (!(state->state & bits)) {
0811 state = next_state(state);
0812 goto next;
0813 }
0814
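/*
 *     | ---- desired range ---- |
 *  | state | or
 *  | ------------- state -------------- |
 *
 * We need to split the extent we found, and may flip bits on the second half.
 *
 * If the extent we found extends past our range, we just split and search
 * again.  It'll get split again the next time though.
 *
 * If the extent we found is inside our range, we clear the desired bit on it.
 */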
0831 if (state->start < start) {
0832 prealloc = alloc_extent_state_atomic(prealloc);
0833 BUG_ON(!prealloc);
0834 err = split_state(tree, state, prealloc, start);
0835 if (err)
0836 extent_io_tree_panic(tree, err);
0837
0838 prealloc = NULL;
0839 if (err)
0840 goto out;
0841 if (state->end <= end) {
0842 state = clear_state_bit(tree, state, bits, wake, changeset);
0843 goto next;
0844 }
0845 goto search_again;
0846 }
0847
0848
0849
0850
0851
0852
0853 if (state->start <= end && state->end > end) {
0854 prealloc = alloc_extent_state_atomic(prealloc);
0855 BUG_ON(!prealloc);
0856 err = split_state(tree, state, prealloc, end + 1);
0857 if (err)
0858 extent_io_tree_panic(tree, err);
0859
0860 if (wake)
0861 wake_up(&state->wq);
0862
0863 clear_state_bit(tree, prealloc, bits, wake, changeset);
0864
0865 prealloc = NULL;
0866 goto out;
0867 }
0868
0869 state = clear_state_bit(tree, state, bits, wake, changeset);
0870 next:
0871 if (last_end == (u64)-1)
0872 goto out;
0873 start = last_end + 1;
0874 if (start <= end && state && !need_resched())
0875 goto hit_next;
0876
0877 search_again:
0878 if (start > end)
0879 goto out;
0880 spin_unlock(&tree->lock);
0881 if (gfpflags_allow_blocking(mask))
0882 cond_resched();
0883 goto again;
0884
0885 out:
0886 spin_unlock(&tree->lock);
0887 if (prealloc)
0888 free_extent_state(prealloc);
0889
0890 return 0;
0891
0892 }
0893
0894 static void wait_on_state(struct extent_io_tree *tree,
0895 struct extent_state *state)
0896 __releases(tree->lock)
0897 __acquires(tree->lock)
0898 {
0899 DEFINE_WAIT(wait);
0900 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
0901 spin_unlock(&tree->lock);
0902 schedule();
0903 spin_lock(&tree->lock);
0904 finish_wait(&state->wq, &wait);
0905 }
0906
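/*
 * Wait for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */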
0912 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
0913 u32 bits)
0914 {
0915 struct extent_state *state;
0916 struct rb_node *node;
0917
0918 btrfs_debug_check_extent_io_range(tree, start, end);
0919
0920 spin_lock(&tree->lock);
0921 again:
0922 while (1) {
0923
0924
0925
0926
0927 node = tree_search(tree, start);
0928 process_node:
0929 if (!node)
0930 break;
0931
0932 state = rb_entry(node, struct extent_state, rb_node);
0933
0934 if (state->start > end)
0935 goto out;
0936
0937 if (state->state & bits) {
0938 start = state->start;
0939 refcount_inc(&state->refs);
0940 wait_on_state(tree, state);
0941 free_extent_state(state);
0942 goto again;
0943 }
0944 start = state->end + 1;
0945
0946 if (start > end)
0947 break;
0948
0949 if (!cond_resched_lock(&tree->lock)) {
0950 node = rb_next(node);
0951 goto process_node;
0952 }
0953 }
0954 out:
0955 spin_unlock(&tree->lock);
0956 }
0957
0958 static void set_state_bits(struct extent_io_tree *tree,
0959 struct extent_state *state,
0960 u32 bits, struct extent_changeset *changeset)
0961 {
0962 u32 bits_to_set = bits & ~EXTENT_CTLBITS;
0963 int ret;
0964
0965 if (tree->private_data && is_data_inode(tree->private_data))
0966 btrfs_set_delalloc_extent(tree->private_data, state, bits);
0967
0968 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
0969 u64 range = state->end - state->start + 1;
0970 tree->dirty_bytes += range;
0971 }
0972 ret = add_extent_changeset(state, bits_to_set, changeset, 1);
0973 BUG_ON(ret < 0);
0974 state->state |= bits_to_set;
0975 }
0976
0977 static void cache_state_if_flags(struct extent_state *state,
0978 struct extent_state **cached_ptr,
0979 unsigned flags)
0980 {
0981 if (cached_ptr && !(*cached_ptr)) {
0982 if (!flags || (state->state & flags)) {
0983 *cached_ptr = state;
0984 refcount_inc(&state->refs);
0985 }
0986 }
0987 }
0988
0989 static void cache_state(struct extent_state *state,
0990 struct extent_state **cached_ptr)
0991 {
0992 return cache_state_if_flags(state, cached_ptr,
0993 EXTENT_LOCKED | EXTENT_BOUNDARY);
0994 }
0995
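/*
 * Set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */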
1006 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
1007 u32 exclusive_bits, u64 *failed_start,
1008 struct extent_state **cached_state, gfp_t mask,
1009 struct extent_changeset *changeset)
1010 {
1011 struct extent_state *state;
1012 struct extent_state *prealloc = NULL;
1013 struct rb_node *node;
1014 struct rb_node **p;
1015 struct rb_node *parent;
1016 int err = 0;
1017 u64 last_start;
1018 u64 last_end;
1019
1020 btrfs_debug_check_extent_io_range(tree, start, end);
1021 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
1022
1023 if (exclusive_bits)
1024 ASSERT(failed_start);
1025 else
1026 ASSERT(failed_start == NULL);
1027 again:
1028 if (!prealloc && gfpflags_allow_blocking(mask)) {
1029
1030
1031
1032
1033
1034
1035
1036 prealloc = alloc_extent_state(mask);
1037 }
1038
1039 spin_lock(&tree->lock);
1040 if (cached_state && *cached_state) {
1041 state = *cached_state;
1042 if (state->start <= start && state->end > start &&
1043 extent_state_in_tree(state)) {
1044 node = &state->rb_node;
1045 goto hit_next;
1046 }
1047 }
1048
1049
1050
1051
1052 node = tree_search_for_insert(tree, start, &p, &parent);
1053 if (!node) {
1054 prealloc = alloc_extent_state_atomic(prealloc);
1055 BUG_ON(!prealloc);
1056 prealloc->start = start;
1057 prealloc->end = end;
1058 insert_state_fast(tree, prealloc, p, parent, bits, changeset);
1059 cache_state(prealloc, cached_state);
1060 prealloc = NULL;
1061 goto out;
1062 }
1063 state = rb_entry(node, struct extent_state, rb_node);
1064 hit_next:
1065 last_start = state->start;
1066 last_end = state->end;
1067
1068
1069
1070
1071
1072
1073
1074 if (state->start == start && state->end <= end) {
1075 if (state->state & exclusive_bits) {
1076 *failed_start = state->start;
1077 err = -EEXIST;
1078 goto out;
1079 }
1080
1081 set_state_bits(tree, state, bits, changeset);
1082 cache_state(state, cached_state);
1083 merge_state(tree, state);
1084 if (last_end == (u64)-1)
1085 goto out;
1086 start = last_end + 1;
1087 state = next_state(state);
1088 if (start < end && state && state->start == start &&
1089 !need_resched())
1090 goto hit_next;
1091 goto search_again;
1092 }
1093
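/*
 *     | ---- desired range ---- |
 *  | state |
 *    or
 *  | ------------- state -------------- |
 *
 * We need to split the extent we found, and may flip bits on the second half.
 *
 * If the extent we found extends past our range, we just split and search
 * again.  It'll get split again the next time though.
 *
 * If the extent we found is inside our range, we set the desired bit on it.
 */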
1110 if (state->start < start) {
1111 if (state->state & exclusive_bits) {
1112 *failed_start = start;
1113 err = -EEXIST;
1114 goto out;
1115 }
1116
1117
1118
1119
1120
1121 if ((state->state & bits) == bits) {
1122 start = state->end + 1;
1123 cache_state(state, cached_state);
1124 goto search_again;
1125 }
1126
1127 prealloc = alloc_extent_state_atomic(prealloc);
1128 BUG_ON(!prealloc);
1129 err = split_state(tree, state, prealloc, start);
1130 if (err)
1131 extent_io_tree_panic(tree, err);
1132
1133 prealloc = NULL;
1134 if (err)
1135 goto out;
1136 if (state->end <= end) {
1137 set_state_bits(tree, state, bits, changeset);
1138 cache_state(state, cached_state);
1139 merge_state(tree, state);
1140 if (last_end == (u64)-1)
1141 goto out;
1142 start = last_end + 1;
1143 state = next_state(state);
1144 if (start < end && state && state->start == start &&
1145 !need_resched())
1146 goto hit_next;
1147 }
1148 goto search_again;
1149 }
1150
1151
1152
1153
1154
1155
1156
1157 if (state->start > start) {
1158 u64 this_end;
1159 if (end < last_start)
1160 this_end = end;
1161 else
1162 this_end = last_start - 1;
1163
1164 prealloc = alloc_extent_state_atomic(prealloc);
1165 BUG_ON(!prealloc);
1166
1167
1168
1169
1170
1171 prealloc->start = start;
1172 prealloc->end = this_end;
1173 err = insert_state(tree, prealloc, bits, changeset);
1174 if (err)
1175 extent_io_tree_panic(tree, err);
1176
1177 cache_state(prealloc, cached_state);
1178 prealloc = NULL;
1179 start = this_end + 1;
1180 goto search_again;
1181 }
1182
1183
1184
1185
1186
1187
1188 if (state->start <= end && state->end > end) {
1189 if (state->state & exclusive_bits) {
1190 *failed_start = start;
1191 err = -EEXIST;
1192 goto out;
1193 }
1194
1195 prealloc = alloc_extent_state_atomic(prealloc);
1196 BUG_ON(!prealloc);
1197 err = split_state(tree, state, prealloc, end + 1);
1198 if (err)
1199 extent_io_tree_panic(tree, err);
1200
1201 set_state_bits(tree, prealloc, bits, changeset);
1202 cache_state(prealloc, cached_state);
1203 merge_state(tree, prealloc);
1204 prealloc = NULL;
1205 goto out;
1206 }
1207
1208 search_again:
1209 if (start > end)
1210 goto out;
1211 spin_unlock(&tree->lock);
1212 if (gfpflags_allow_blocking(mask))
1213 cond_resched();
1214 goto again;
1215
1216 out:
1217 spin_unlock(&tree->lock);
1218 if (prealloc)
1219 free_extent_state(prealloc);
1220
1221 return err;
1222
1223 }
1224
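/*
 * Convert all bits in a given range from one bit to another.
 *
 * @tree:         the io tree to search
 * @start:        the start offset in bytes
 * @end:          the end offset in bytes (inclusive)
 * @bits:         the bits to set in this range
 * @clear_bits:   the bits to clear in this range
 * @cached_state: state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.
 */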
1243 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1244 u32 bits, u32 clear_bits,
1245 struct extent_state **cached_state)
1246 {
1247 struct extent_state *state;
1248 struct extent_state *prealloc = NULL;
1249 struct rb_node *node;
1250 struct rb_node **p;
1251 struct rb_node *parent;
1252 int err = 0;
1253 u64 last_start;
1254 u64 last_end;
1255 bool first_iteration = true;
1256
1257 btrfs_debug_check_extent_io_range(tree, start, end);
1258 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1259 clear_bits);
1260
1261 again:
1262 if (!prealloc) {
1263
1264
1265
1266
1267
1268
1269
1270 prealloc = alloc_extent_state(GFP_NOFS);
1271 if (!prealloc && !first_iteration)
1272 return -ENOMEM;
1273 }
1274
1275 spin_lock(&tree->lock);
1276 if (cached_state && *cached_state) {
1277 state = *cached_state;
1278 if (state->start <= start && state->end > start &&
1279 extent_state_in_tree(state)) {
1280 node = &state->rb_node;
1281 goto hit_next;
1282 }
1283 }
1284
1285
1286
1287
1288
1289 node = tree_search_for_insert(tree, start, &p, &parent);
1290 if (!node) {
1291 prealloc = alloc_extent_state_atomic(prealloc);
1292 if (!prealloc) {
1293 err = -ENOMEM;
1294 goto out;
1295 }
1296 prealloc->start = start;
1297 prealloc->end = end;
1298 insert_state_fast(tree, prealloc, p, parent, bits, NULL);
1299 cache_state(prealloc, cached_state);
1300 prealloc = NULL;
1301 goto out;
1302 }
1303 state = rb_entry(node, struct extent_state, rb_node);
1304 hit_next:
1305 last_start = state->start;
1306 last_end = state->end;
1307
1308
1309
1310
1311
1312
1313
1314 if (state->start == start && state->end <= end) {
1315 set_state_bits(tree, state, bits, NULL);
1316 cache_state(state, cached_state);
1317 state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1318 if (last_end == (u64)-1)
1319 goto out;
1320 start = last_end + 1;
1321 if (start < end && state && state->start == start &&
1322 !need_resched())
1323 goto hit_next;
1324 goto search_again;
1325 }
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343 if (state->start < start) {
1344 prealloc = alloc_extent_state_atomic(prealloc);
1345 if (!prealloc) {
1346 err = -ENOMEM;
1347 goto out;
1348 }
1349 err = split_state(tree, state, prealloc, start);
1350 if (err)
1351 extent_io_tree_panic(tree, err);
1352 prealloc = NULL;
1353 if (err)
1354 goto out;
1355 if (state->end <= end) {
1356 set_state_bits(tree, state, bits, NULL);
1357 cache_state(state, cached_state);
1358 state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1359 if (last_end == (u64)-1)
1360 goto out;
1361 start = last_end + 1;
1362 if (start < end && state && state->start == start &&
1363 !need_resched())
1364 goto hit_next;
1365 }
1366 goto search_again;
1367 }
1368
1369
1370
1371
1372
1373
1374
1375 if (state->start > start) {
1376 u64 this_end;
1377 if (end < last_start)
1378 this_end = end;
1379 else
1380 this_end = last_start - 1;
1381
1382 prealloc = alloc_extent_state_atomic(prealloc);
1383 if (!prealloc) {
1384 err = -ENOMEM;
1385 goto out;
1386 }
1387
1388
1389
1390
1391
1392 prealloc->start = start;
1393 prealloc->end = this_end;
1394 err = insert_state(tree, prealloc, bits, NULL);
1395 if (err)
1396 extent_io_tree_panic(tree, err);
1397 cache_state(prealloc, cached_state);
1398 prealloc = NULL;
1399 start = this_end + 1;
1400 goto search_again;
1401 }
1402
1403
1404
1405
1406
1407
1408 if (state->start <= end && state->end > end) {
1409 prealloc = alloc_extent_state_atomic(prealloc);
1410 if (!prealloc) {
1411 err = -ENOMEM;
1412 goto out;
1413 }
1414
1415 err = split_state(tree, state, prealloc, end + 1);
1416 if (err)
1417 extent_io_tree_panic(tree, err);
1418
1419 set_state_bits(tree, prealloc, bits, NULL);
1420 cache_state(prealloc, cached_state);
1421 clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
1422 prealloc = NULL;
1423 goto out;
1424 }
1425
1426 search_again:
1427 if (start > end)
1428 goto out;
1429 spin_unlock(&tree->lock);
1430 cond_resched();
1431 first_iteration = false;
1432 goto again;
1433
1434 out:
1435 spin_unlock(&tree->lock);
1436 if (prealloc)
1437 free_extent_state(prealloc);
1438
1439 return err;
1440 }
1441
1442
1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1444 u32 bits, struct extent_changeset *changeset)
1445 {
1446
1447
1448
1449
1450
1451
1452 BUG_ON(bits & EXTENT_LOCKED);
1453
1454 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1455 changeset);
1456 }
1457
1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1459 u32 bits)
1460 {
1461 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1462 GFP_NOWAIT, NULL);
1463 }
1464
1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1466 u32 bits, int wake, int delete,
1467 struct extent_state **cached)
1468 {
1469 return __clear_extent_bit(tree, start, end, bits, wake, delete,
1470 cached, GFP_NOFS, NULL);
1471 }
1472
1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1474 u32 bits, struct extent_changeset *changeset)
1475 {
1476
1477
1478
1479
1480 BUG_ON(bits & EXTENT_LOCKED);
1481
1482 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1483 changeset);
1484 }
1485
1486
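/*
 * Either insert or lock the state struct between start and end inclusive,
 * sleeping until any conflicting EXTENT_LOCKED range is released.
 */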
1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1491 struct extent_state **cached_state)
1492 {
1493 int err;
1494 u64 failed_start;
1495
1496 while (1) {
1497 err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1498 EXTENT_LOCKED, &failed_start,
1499 cached_state, GFP_NOFS, NULL);
1500 if (err == -EEXIST) {
1501 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1502 start = failed_start;
1503 } else
1504 break;
1505 WARN_ON(start > end);
1506 }
1507 return err;
1508 }
1509
1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1511 {
1512 int err;
1513 u64 failed_start;
1514
1515 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1516 &failed_start, NULL, GFP_NOFS, NULL);
1517 if (err == -EEXIST) {
1518 if (failed_start > start)
1519 clear_extent_bit(tree, start, failed_start - 1,
1520 EXTENT_LOCKED, 1, 0, NULL);
1521 return 0;
1522 }
1523 return 1;
1524 }
1525
1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1527 {
1528 unsigned long index = start >> PAGE_SHIFT;
1529 unsigned long end_index = end >> PAGE_SHIFT;
1530 struct page *page;
1531
1532 while (index <= end_index) {
1533 page = find_get_page(inode->i_mapping, index);
1534 BUG_ON(!page);
1535 clear_page_dirty_for_io(page);
1536 put_page(page);
1537 index++;
1538 }
1539 }
1540
1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1542 {
1543 struct address_space *mapping = inode->i_mapping;
1544 unsigned long index = start >> PAGE_SHIFT;
1545 unsigned long end_index = end >> PAGE_SHIFT;
1546 struct folio *folio;
1547
1548 while (index <= end_index) {
1549 folio = filemap_get_folio(mapping, index);
1550 filemap_dirty_folio(mapping, folio);
1551 folio_account_redirty(folio);
1552 index += folio_nr_pages(folio);
1553 folio_put(folio);
1554 }
1555 }
1556
1557
1558
1559
1560
1561 static struct extent_state *
1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1563 {
1564 struct rb_node *node;
1565 struct extent_state *state;
1566
1567
1568
1569
1570
1571 node = tree_search(tree, start);
1572 if (!node)
1573 goto out;
1574
1575 while (1) {
1576 state = rb_entry(node, struct extent_state, rb_node);
1577 if (state->end >= start && (state->state & bits))
1578 return state;
1579
1580 node = rb_next(node);
1581 if (!node)
1582 break;
1583 }
1584 out:
1585 return NULL;
1586 }
1587
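/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */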
1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1597 u64 *start_ret, u64 *end_ret, u32 bits,
1598 struct extent_state **cached_state)
1599 {
1600 struct extent_state *state;
1601 int ret = 1;
1602
1603 spin_lock(&tree->lock);
1604 if (cached_state && *cached_state) {
1605 state = *cached_state;
1606 if (state->end == start - 1 && extent_state_in_tree(state)) {
1607 while ((state = next_state(state)) != NULL) {
1608 if (state->state & bits)
1609 goto got_it;
1610 }
1611 free_extent_state(*cached_state);
1612 *cached_state = NULL;
1613 goto out;
1614 }
1615 free_extent_state(*cached_state);
1616 *cached_state = NULL;
1617 }
1618
1619 state = find_first_extent_bit_state(tree, start, bits);
1620 got_it:
1621 if (state) {
1622 cache_state_if_flags(state, cached_state, 0);
1623 *start_ret = state->start;
1624 *end_ret = state->end;
1625 ret = 0;
1626 }
1627 out:
1628 spin_unlock(&tree->lock);
1629 return ret;
1630 }
1631
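/*
 * Find a contiguous area of bits.
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the end of the contiguous range of bits set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately and then merge them again; during that time the
 * tree lock is dropped, so use this helper if you want to find the actual
 * contiguous area of bits.  Returns the area in @start_ret and @end_ret, or 1
 * if nothing was found.
 */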
1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1649 u64 *start_ret, u64 *end_ret, u32 bits)
1650 {
1651 struct extent_state *state;
1652 int ret = 1;
1653
1654 spin_lock(&tree->lock);
1655 state = find_first_extent_bit_state(tree, start, bits);
1656 if (state) {
1657 *start_ret = state->start;
1658 *end_ret = state->end;
1659 while ((state = next_state(state)) != NULL) {
1660 if (state->start > (*end_ret + 1))
1661 break;
1662 *end_ret = state->end;
1663 }
1664 ret = 0;
1665 }
1666 spin_unlock(&tree->lock);
1667 return ret;
1668 }
1669
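/*
 * Find the first range that has @bits not set.  This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since an unallocated range is also considered one which doesn't have the
 * bits set, it's possible that @end_ret contains -1; this happens in case the
 * range spans (last_range_end, end of device].  In this case it's up to the
 * caller to trim @end_ret to the appropriate size.
 */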
1685 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1686 u64 *start_ret, u64 *end_ret, u32 bits)
1687 {
1688 struct extent_state *state;
1689 struct rb_node *node, *prev = NULL, *next;
1690
1691 spin_lock(&tree->lock);
1692
1693
1694 while (1) {
1695 node = tree_search_prev_next(tree, start, &prev, &next);
1696 if (!node && !next && !prev) {
1697
1698
1699
1700
1701 *start_ret = 0;
1702 *end_ret = -1;
1703 goto out;
1704 } else if (!node && !next) {
1705
1706
1707
1708
1709 state = rb_entry(prev, struct extent_state, rb_node);
1710 *start_ret = state->end + 1;
1711 *end_ret = -1;
1712 goto out;
1713 } else if (!node) {
1714 node = next;
1715 }
1716
1717
1718
1719
1720 state = rb_entry(node, struct extent_state, rb_node);
1721
1722 if (in_range(start, state->start, state->end - state->start + 1)) {
1723 if (state->state & bits) {
1724
1725
1726
1727
1728
1729 start = state->end + 1;
1730 } else {
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740 *start_ret = state->start;
1741 break;
1742 }
1743 } else {
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755 if (prev) {
1756 state = rb_entry(prev, struct extent_state,
1757 rb_node);
1758 *start_ret = state->end + 1;
1759 } else {
1760 *start_ret = 0;
1761 }
1762 break;
1763 }
1764 }
1765
1766
1767
1768
1769
1770 while (1) {
1771 state = rb_entry(node, struct extent_state, rb_node);
1772 if (state->end >= start && !(state->state & bits)) {
1773 *end_ret = state->end;
1774 } else {
1775 *end_ret = state->start - 1;
1776 break;
1777 }
1778
1779 node = rb_next(node);
1780 if (!node)
1781 break;
1782 }
1783 out:
1784 spin_unlock(&tree->lock);
1785 }
1786
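/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'.  start and end are used to return the range.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */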
1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1794 u64 *end, u64 max_bytes,
1795 struct extent_state **cached_state)
1796 {
1797 struct rb_node *node;
1798 struct extent_state *state;
1799 u64 cur_start = *start;
1800 bool found = false;
1801 u64 total_bytes = 0;
1802
1803 spin_lock(&tree->lock);
1804
1805
1806
1807
1808
1809 node = tree_search(tree, cur_start);
1810 if (!node) {
1811 *end = (u64)-1;
1812 goto out;
1813 }
1814
1815 while (1) {
1816 state = rb_entry(node, struct extent_state, rb_node);
1817 if (found && (state->start != cur_start ||
1818 (state->state & EXTENT_BOUNDARY))) {
1819 goto out;
1820 }
1821 if (!(state->state & EXTENT_DELALLOC)) {
1822 if (!found)
1823 *end = state->end;
1824 goto out;
1825 }
1826 if (!found) {
1827 *start = state->start;
1828 *cached_state = state;
1829 refcount_inc(&state->refs);
1830 }
1831 found = true;
1832 *end = state->end;
1833 cur_start = state->end + 1;
1834 node = rb_next(node);
1835 total_bytes += state->end - state->start + 1;
1836 if (total_bytes >= max_bytes)
1837 break;
1838 if (!node)
1839 break;
1840 }
1841 out:
1842 spin_unlock(&tree->lock);
1843 return found;
1844 }
1845
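/*
 * Process one page for __process_pages_contig().
 *
 * Return >0 if we hit @page == @locked_page.
 * Return 0 if we updated the page status.
 * Return -EAGAIN if we need to try again (only for PAGE_LOCK, when the page is
 * no longer dirty or no longer belongs to @mapping).
 */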
1854 static int process_one_page(struct btrfs_fs_info *fs_info,
1855 struct address_space *mapping,
1856 struct page *page, struct page *locked_page,
1857 unsigned long page_ops, u64 start, u64 end)
1858 {
1859 u32 len;
1860
1861 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1862 len = end + 1 - start;
1863
1864 if (page_ops & PAGE_SET_ORDERED)
1865 btrfs_page_clamp_set_ordered(fs_info, page, start, len);
1866 if (page_ops & PAGE_SET_ERROR)
1867 btrfs_page_clamp_set_error(fs_info, page, start, len);
1868 if (page_ops & PAGE_START_WRITEBACK) {
1869 btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1870 btrfs_page_clamp_set_writeback(fs_info, page, start, len);
1871 }
1872 if (page_ops & PAGE_END_WRITEBACK)
1873 btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
1874
1875 if (page == locked_page)
1876 return 1;
1877
1878 if (page_ops & PAGE_LOCK) {
1879 int ret;
1880
1881 ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1882 if (ret)
1883 return ret;
1884 if (!PageDirty(page) || page->mapping != mapping) {
1885 btrfs_page_end_writer_lock(fs_info, page, start, len);
1886 return -EAGAIN;
1887 }
1888 }
1889 if (page_ops & PAGE_UNLOCK)
1890 btrfs_page_end_writer_lock(fs_info, page, start, len);
1891 return 0;
1892 }
1893
1894 static int __process_pages_contig(struct address_space *mapping,
1895 struct page *locked_page,
1896 u64 start, u64 end, unsigned long page_ops,
1897 u64 *processed_end)
1898 {
1899 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
1900 pgoff_t start_index = start >> PAGE_SHIFT;
1901 pgoff_t end_index = end >> PAGE_SHIFT;
1902 pgoff_t index = start_index;
1903 unsigned long nr_pages = end_index - start_index + 1;
1904 unsigned long pages_processed = 0;
1905 struct page *pages[16];
1906 int err = 0;
1907 int i;
1908
1909 if (page_ops & PAGE_LOCK) {
1910 ASSERT(page_ops == PAGE_LOCK);
1911 ASSERT(processed_end && *processed_end == start);
1912 }
1913
1914 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1915 mapping_set_error(mapping, -EIO);
1916
1917 while (nr_pages > 0) {
1918 int found_pages;
1919
1920 found_pages = find_get_pages_contig(mapping, index,
1921 min_t(unsigned long,
1922 nr_pages, ARRAY_SIZE(pages)), pages);
1923 if (found_pages == 0) {
1924
1925
1926
1927
1928 ASSERT(page_ops & PAGE_LOCK);
1929 err = -EAGAIN;
1930 goto out;
1931 }
1932
1933 for (i = 0; i < found_pages; i++) {
1934 int process_ret;
1935
1936 process_ret = process_one_page(fs_info, mapping,
1937 pages[i], locked_page, page_ops,
1938 start, end);
1939 if (process_ret < 0) {
1940 for (; i < found_pages; i++)
1941 put_page(pages[i]);
1942 err = -EAGAIN;
1943 goto out;
1944 }
1945 put_page(pages[i]);
1946 pages_processed++;
1947 }
1948 nr_pages -= found_pages;
1949 index += found_pages;
1950 cond_resched();
1951 }
1952 out:
1953 if (err && processed_end) {
1954
1955
1956
1957
1958
1959
1960
1961
1962 if (pages_processed)
1963 *processed_end = min(end,
1964 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1965 else
1966 *processed_end = start;
1967 }
1968 return err;
1969 }
1970
1971 static noinline void __unlock_for_delalloc(struct inode *inode,
1972 struct page *locked_page,
1973 u64 start, u64 end)
1974 {
1975 unsigned long index = start >> PAGE_SHIFT;
1976 unsigned long end_index = end >> PAGE_SHIFT;
1977
1978 ASSERT(locked_page);
1979 if (index == locked_page->index && end_index == index)
1980 return;
1981
1982 __process_pages_contig(inode->i_mapping, locked_page, start, end,
1983 PAGE_UNLOCK, NULL);
1984 }
1985
1986 static noinline int lock_delalloc_pages(struct inode *inode,
1987 struct page *locked_page,
1988 u64 delalloc_start,
1989 u64 delalloc_end)
1990 {
1991 unsigned long index = delalloc_start >> PAGE_SHIFT;
1992 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1993 u64 processed_end = delalloc_start;
1994 int ret;
1995
1996 ASSERT(locked_page);
1997 if (index == locked_page->index && index == end_index)
1998 return 0;
1999
2000 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
2001 delalloc_end, PAGE_LOCK, &processed_end);
2002 if (ret == -EAGAIN && processed_end > delalloc_start)
2003 __unlock_for_delalloc(inode, locked_page, delalloc_start,
2004 processed_end);
2005 return ret;
2006 }
2007
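/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc,
 * not more than @max_bytes.
 *
 * @start:  The original start bytenr to search.  Will store the extent range
 *          start bytenr.
 * @end:    The original end bytenr of the search range.  Will store the extent
 *          range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */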
2023 EXPORT_FOR_TESTS
2024 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
2025 struct page *locked_page, u64 *start,
2026 u64 *end)
2027 {
2028 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2029 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2030 const u64 orig_start = *start;
2031 const u64 orig_end = *end;
2032
2033 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
2034 u64 delalloc_start;
2035 u64 delalloc_end;
2036 bool found;
2037 struct extent_state *cached_state = NULL;
2038 int ret;
2039 int loops = 0;
2040
2041
2042 ASSERT(orig_end > orig_start);
2043
2044
2045 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
2046 orig_end <= page_offset(locked_page)));
2047 again:
2048
2049 delalloc_start = *start;
2050 delalloc_end = 0;
2051 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2052 max_bytes, &cached_state);
2053 if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
2054 *start = delalloc_start;
2055
2056
2057 *end = min(delalloc_end, orig_end);
2058 free_extent_state(cached_state);
2059 return false;
2060 }
2061
2062
2063
2064
2065
2066
2067 if (delalloc_start < *start)
2068 delalloc_start = *start;
2069
2070
2071
2072
2073 if (delalloc_end + 1 - delalloc_start > max_bytes)
2074 delalloc_end = delalloc_start + max_bytes - 1;
2075
2076
2077 ret = lock_delalloc_pages(inode, locked_page,
2078 delalloc_start, delalloc_end);
2079 ASSERT(!ret || ret == -EAGAIN);
2080 if (ret == -EAGAIN) {
2081
2082
2083
2084 free_extent_state(cached_state);
2085 cached_state = NULL;
2086 if (!loops) {
2087 max_bytes = PAGE_SIZE;
2088 loops = 1;
2089 goto again;
2090 } else {
2091 found = false;
2092 goto out_failed;
2093 }
2094 }
2095
2096
2097 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
2098
2099
2100 ret = test_range_bit(tree, delalloc_start, delalloc_end,
2101 EXTENT_DELALLOC, 1, cached_state);
2102 if (!ret) {
2103 unlock_extent_cached(tree, delalloc_start, delalloc_end,
2104 &cached_state);
2105 __unlock_for_delalloc(inode, locked_page,
2106 delalloc_start, delalloc_end);
2107 cond_resched();
2108 goto again;
2109 }
2110 free_extent_state(cached_state);
2111 *start = delalloc_start;
2112 *end = delalloc_end;
2113 out_failed:
2114 return found;
2115 }
2116
2117 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2118 struct page *locked_page,
2119 u32 clear_bits, unsigned long page_ops)
2120 {
2121 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2122
2123 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2124 start, end, page_ops, NULL);
2125 }
2126
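/*
 * Count the number of bytes in the tree that have a given bit(s) set.  This
 * can be fairly slow, except for EXTENT_DIRTY which is cached.  The total
 * number found is returned.
 */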
2132 u64 count_range_bits(struct extent_io_tree *tree,
2133 u64 *start, u64 search_end, u64 max_bytes,
2134 u32 bits, int contig)
2135 {
2136 struct rb_node *node;
2137 struct extent_state *state;
2138 u64 cur_start = *start;
2139 u64 total_bytes = 0;
2140 u64 last = 0;
2141 int found = 0;
2142
2143 if (WARN_ON(search_end <= cur_start))
2144 return 0;
2145
2146 spin_lock(&tree->lock);
2147 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2148 total_bytes = tree->dirty_bytes;
2149 goto out;
2150 }
2151
2152
2153
2154
2155 node = tree_search(tree, cur_start);
2156 if (!node)
2157 goto out;
2158
2159 while (1) {
2160 state = rb_entry(node, struct extent_state, rb_node);
2161 if (state->start > search_end)
2162 break;
2163 if (contig && found && state->start > last + 1)
2164 break;
2165 if (state->end >= cur_start && (state->state & bits) == bits) {
2166 total_bytes += min(search_end, state->end) + 1 -
2167 max(cur_start, state->start);
2168 if (total_bytes >= max_bytes)
2169 break;
2170 if (!found) {
2171 *start = max(cur_start, state->start);
2172 found = 1;
2173 }
2174 last = state->end;
2175 } else if (contig && found) {
2176 break;
2177 }
2178 node = rb_next(node);
2179 if (!node)
2180 break;
2181 }
2182 out:
2183 spin_unlock(&tree->lock);
2184 return total_bytes;
2185 }
2186
2187
2188
2189
2190
2191 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2192 struct io_failure_record *failrec)
2193 {
2194 struct rb_node *node;
2195 struct extent_state *state;
2196 int ret = 0;
2197
2198 spin_lock(&tree->lock);
2199
2200
2201
2202
2203 node = tree_search(tree, start);
2204 if (!node) {
2205 ret = -ENOENT;
2206 goto out;
2207 }
2208 state = rb_entry(node, struct extent_state, rb_node);
2209 if (state->start != start) {
2210 ret = -ENOENT;
2211 goto out;
2212 }
2213 state->failrec = failrec;
2214 out:
2215 spin_unlock(&tree->lock);
2216 return ret;
2217 }
2218
2219 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2220 {
2221 struct rb_node *node;
2222 struct extent_state *state;
2223 struct io_failure_record *failrec;
2224
2225 spin_lock(&tree->lock);
2226
2227
2228
2229
2230 node = tree_search(tree, start);
2231 if (!node) {
2232 failrec = ERR_PTR(-ENOENT);
2233 goto out;
2234 }
2235 state = rb_entry(node, struct extent_state, rb_node);
2236 if (state->start != start) {
2237 failrec = ERR_PTR(-ENOENT);
2238 goto out;
2239 }
2240
2241 failrec = state->failrec;
2242 out:
2243 spin_unlock(&tree->lock);
2244 return failrec;
2245 }
2246
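/*
 * Search a range in the state tree for a given mask.  If 'filled' == 1, this
 * returns 1 only if every extent in the tree has the bits set.  Otherwise, 1
 * is returned if any bit in the range is found set.
 */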
2253 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2254 u32 bits, int filled, struct extent_state *cached)
2255 {
2256 struct extent_state *state = NULL;
2257 struct rb_node *node;
2258 int bitset = 0;
2259
2260 spin_lock(&tree->lock);
2261 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2262 cached->end > start)
2263 node = &cached->rb_node;
2264 else
2265 node = tree_search(tree, start);
2266 while (node && start <= end) {
2267 state = rb_entry(node, struct extent_state, rb_node);
2268
2269 if (filled && state->start > start) {
2270 bitset = 0;
2271 break;
2272 }
2273
2274 if (state->start > end)
2275 break;
2276
2277 if (state->state & bits) {
2278 bitset = 1;
2279 if (!filled)
2280 break;
2281 } else if (filled) {
2282 bitset = 0;
2283 break;
2284 }
2285
2286 if (state->end == (u64)-1)
2287 break;
2288
2289 start = state->end + 1;
2290 if (start > end)
2291 break;
2292 node = rb_next(node);
2293 if (!node) {
2294 if (filled)
2295 bitset = 0;
2296 break;
2297 }
2298 }
2299 spin_unlock(&tree->lock);
2300 return bitset;
2301 }
2302
2303 int free_io_failure(struct extent_io_tree *failure_tree,
2304 struct extent_io_tree *io_tree,
2305 struct io_failure_record *rec)
2306 {
2307 int ret;
2308 int err = 0;
2309
2310 set_state_failrec(failure_tree, rec->start, NULL);
2311 ret = clear_extent_bits(failure_tree, rec->start,
2312 rec->start + rec->len - 1,
2313 EXTENT_LOCKED | EXTENT_DIRTY);
2314 if (ret)
2315 err = ret;
2316
2317 ret = clear_extent_bits(io_tree, rec->start,
2318 rec->start + rec->len - 1,
2319 EXTENT_DAMAGED);
2320 if (ret && !err)
2321 err = ret;
2322
2323 kfree(rec);
2324 return err;
2325 }
2326
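/*
 * This deliberately bypasses the standard btrfs submit functions, as the
 * standard behavior is to write all copies in a raid setup.  Here we only
 * want to write the one bad copy, so we do the mapping ourselves and submit
 * the bio directly.
 */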
2337 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2338 u64 length, u64 logical, struct page *page,
2339 unsigned int pg_offset, int mirror_num)
2340 {
2341 struct btrfs_device *dev;
2342 struct bio_vec bvec;
2343 struct bio bio;
2344 u64 map_length = 0;
2345 u64 sector;
2346 struct btrfs_io_context *bioc = NULL;
2347 int ret = 0;
2348
2349 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2350 BUG_ON(!mirror_num);
2351
2352 if (btrfs_repair_one_zone(fs_info, logical))
2353 return 0;
2354
2355 map_length = length;
2356
2357
2358
2359
2360
2361
2362 btrfs_bio_counter_inc_blocked(fs_info);
2363 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2364
2365
2366
2367
2368
2369
2370 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2371 &map_length, &bioc, 0);
2372 if (ret)
2373 goto out_counter_dec;
2374 ASSERT(bioc->mirror_num == 1);
2375 } else {
2376 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2377 &map_length, &bioc, mirror_num);
2378 if (ret)
2379 goto out_counter_dec;
2380 BUG_ON(mirror_num != bioc->mirror_num);
2381 }
2382
2383 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
2384 dev = bioc->stripes[bioc->mirror_num - 1].dev;
2385 btrfs_put_bioc(bioc);
2386
2387 if (!dev || !dev->bdev ||
2388 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2389 ret = -EIO;
2390 goto out_counter_dec;
2391 }
2392
2393 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2394 bio.bi_iter.bi_sector = sector;
2395 __bio_add_page(&bio, page, length, pg_offset);
2396
2397 btrfsic_check_bio(&bio);
2398 ret = submit_bio_wait(&bio);
2399 if (ret) {
2400
2401 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2402 goto out_bio_uninit;
2403 }
2404
2405 btrfs_info_rl_in_rcu(fs_info,
2406 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2407 ino, start,
2408 rcu_str_deref(dev->name), sector);
2409 ret = 0;
2410
2411 out_bio_uninit:
2412 bio_uninit(&bio);
2413 out_counter_dec:
2414 btrfs_bio_counter_dec(fs_info);
2415 return ret;
2416 }
2417
2418 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2419 {
2420 struct btrfs_fs_info *fs_info = eb->fs_info;
2421 u64 start = eb->start;
2422 int i, num_pages = num_extent_pages(eb);
2423 int ret = 0;
2424
2425 if (sb_rdonly(fs_info->sb))
2426 return -EROFS;
2427
2428 for (i = 0; i < num_pages; i++) {
2429 struct page *p = eb->pages[i];
2430
2431 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2432 start - page_offset(p), mirror_num);
2433 if (ret)
2434 break;
2435 start += PAGE_SIZE;
2436 }
2437
2438 return ret;
2439 }
2440
2441 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
2442 {
2443 if (cur_mirror == failrec->num_copies)
2444 return cur_mirror + 1 - failrec->num_copies;
2445 return cur_mirror + 1;
2446 }
2447
2448 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
2449 {
2450 if (cur_mirror == 1)
2451 return failrec->num_copies;
2452 return cur_mirror - 1;
2453 }
2454
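/*
 * Each time an IO finishes, we do a fast check in the IO failure tree to see
 * if we need to process or clean up an io_failure_record.
 */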
2459 int clean_io_failure(struct btrfs_fs_info *fs_info,
2460 struct extent_io_tree *failure_tree,
2461 struct extent_io_tree *io_tree, u64 start,
2462 struct page *page, u64 ino, unsigned int pg_offset)
2463 {
2464 u64 private;
2465 struct io_failure_record *failrec;
2466 struct extent_state *state;
2467 int mirror;
2468 int ret;
2469
2470 private = 0;
2471 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2472 EXTENT_DIRTY, 0);
2473 if (!ret)
2474 return 0;
2475
2476 failrec = get_state_failrec(failure_tree, start);
2477 if (IS_ERR(failrec))
2478 return 0;
2479
2480 BUG_ON(!failrec->this_mirror);
2481
2482 if (sb_rdonly(fs_info->sb))
2483 goto out;
2484
2485 spin_lock(&io_tree->lock);
2486 state = find_first_extent_bit_state(io_tree,
2487 failrec->start,
2488 EXTENT_LOCKED);
2489 spin_unlock(&io_tree->lock);
2490
2491 if (!state || state->start > failrec->start ||
2492 state->end < failrec->start + failrec->len - 1)
2493 goto out;
2494
2495 mirror = failrec->this_mirror;
2496 do {
2497 mirror = prev_mirror(failrec, mirror);
2498 repair_io_failure(fs_info, ino, start, failrec->len,
2499 failrec->logical, page, pg_offset, mirror);
2500 } while (mirror != failrec->failed_mirror);
2501
2502 out:
2503 free_io_failure(failure_tree, io_tree, failrec);
2504 return 0;
2505 }
2506
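/*
 * Free the io_failure_records attached to the failure tree states in the
 * given range of the inode.
 */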
2513 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2514 {
2515 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2516 struct io_failure_record *failrec;
2517 struct extent_state *state, *next;
2518
2519 if (RB_EMPTY_ROOT(&failure_tree->state))
2520 return;
2521
2522 spin_lock(&failure_tree->lock);
2523 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2524 while (state) {
2525 if (state->start > end)
2526 break;
2527
2528 ASSERT(state->end <= end);
2529
2530 next = next_state(state);
2531
2532 failrec = state->failrec;
2533 free_extent_state(state);
2534 kfree(failrec);
2535
2536 state = next;
2537 }
2538 spin_unlock(&failure_tree->lock);
2539 }
2540
2541 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2542 struct btrfs_bio *bbio,
2543 unsigned int bio_offset)
2544 {
2545 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2546 u64 start = bbio->file_offset + bio_offset;
2547 struct io_failure_record *failrec;
2548 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2549 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2550 const u32 sectorsize = fs_info->sectorsize;
2551 int ret;
2552
2553 failrec = get_state_failrec(failure_tree, start);
2554 if (!IS_ERR(failrec)) {
2555 btrfs_debug(fs_info,
2556 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2557 failrec->logical, failrec->start, failrec->len);
2558
2559
2560
2561
2562
2563 ASSERT(failrec->this_mirror == bbio->mirror_num);
2564 ASSERT(failrec->len == fs_info->sectorsize);
2565 return failrec;
2566 }
2567
2568 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2569 if (!failrec)
2570 return ERR_PTR(-ENOMEM);
2571
2572 failrec->start = start;
2573 failrec->len = sectorsize;
2574 failrec->failed_mirror = bbio->mirror_num;
2575 failrec->this_mirror = bbio->mirror_num;
2576 failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
2577
2578 btrfs_debug(fs_info,
2579 "new io failure record logical %llu start %llu",
2580 failrec->logical, start);
2581
2582 failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
2583 if (failrec->num_copies == 1) {
2584
2585
2586
2587
2588
2589 btrfs_debug(fs_info,
2590 "cannot repair logical %llu num_copies %d",
2591 failrec->logical, failrec->num_copies);
2592 kfree(failrec);
2593 return ERR_PTR(-EIO);
2594 }
2595
2596 /* Mark the failed range in the inode's io failure tree. */
2597 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
2598 EXTENT_LOCKED | EXTENT_DIRTY);
2599 if (ret >= 0) {
2600 ret = set_state_failrec(failure_tree, start, failrec);
2601 /* Mark the range as damaged in the inode's io_tree. */
2602 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2603 EXTENT_DAMAGED);
2604 } else if (ret < 0) {
2605 kfree(failrec);
2606 return ERR_PTR(ret);
2607 }
2608
2609 return failrec;
2610 }
2611
2612 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
2613 u32 bio_offset, struct page *page, unsigned int pgoff,
2614 submit_bio_hook_t *submit_bio_hook)
2615 {
2616 u64 start = failed_bbio->file_offset + bio_offset;
2617 struct io_failure_record *failrec;
2618 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2619 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2620 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2621 struct bio *failed_bio = &failed_bbio->bio;
2622 const int icsum = bio_offset >> fs_info->sectorsize_bits;
2623 struct bio *repair_bio;
2624 struct btrfs_bio *repair_bbio;
2625
2626 btrfs_debug(fs_info,
2627 "repair read error: read error at %llu", start);
2628
2629 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2630
2631 failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
2632 if (IS_ERR(failrec))
2633 return PTR_ERR(failrec);
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
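/*
 * Move on to the next mirror.  If that wraps back around to the mirror
 * that failed in the first place, every copy has been tried and the
 * error cannot be repaired.
 */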
2644 failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
2645 if (failrec->this_mirror == failrec->failed_mirror) {
2646 btrfs_debug(fs_info,
2647 "failed to repair num_copies %d this_mirror %d failed_mirror %d",
2648 failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
2649 free_io_failure(failure_tree, tree, failrec);
2650 return -EIO;
2651 }
2652
2653 repair_bio = btrfs_bio_alloc(1);
2654 repair_bbio = btrfs_bio(repair_bio);
2655 repair_bbio->file_offset = start;
2656 repair_bio->bi_opf = REQ_OP_READ;
2657 repair_bio->bi_end_io = failed_bio->bi_end_io;
2658 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2659 repair_bio->bi_private = failed_bio->bi_private;
2660
2661 if (failed_bbio->csum) {
2662 const u32 csum_size = fs_info->csum_size;
2663
2664 repair_bbio->csum = repair_bbio->csum_inline;
2665 memcpy(repair_bbio->csum,
2666 failed_bbio->csum + csum_size * icsum, csum_size);
2667 }
2668
2669 bio_add_page(repair_bio, page, failrec->len, pgoff);
2670 repair_bbio->iter = repair_bio->bi_iter;
2671
2672 btrfs_debug(btrfs_sb(inode->i_sb),
2673 "repair read error: submitting new read to mirror %d",
2674 failrec->this_mirror);
2675
2676
2677
2678
2679
2680
2681 submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
2682 return BLK_STS_OK;
2683 }
2684
2685 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2686 {
2687 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2688
2689 ASSERT(page_offset(page) <= start &&
2690 start + len <= page_offset(page) + PAGE_SIZE);
2691
2692 if (uptodate) {
2693 if (fsverity_active(page->mapping->host) &&
2694 !PageError(page) &&
2695 !PageUptodate(page) &&
2696 start < i_size_read(page->mapping->host) &&
2697 !fsverity_verify_page(page)) {
2698 btrfs_page_set_error(fs_info, page, start, len);
2699 } else {
2700 btrfs_page_set_uptodate(fs_info, page, start, len);
2701 }
2702 } else {
2703 btrfs_page_clear_uptodate(fs_info, page, start, len);
2704 btrfs_page_set_error(fs_info, page, start, len);
2705 }
2706
2707 if (!btrfs_is_subpage(fs_info, page))
2708 unlock_page(page);
2709 else
2710 btrfs_subpage_end_reader(fs_info, page, start, len);
2711 }
2712
2713 static void end_sector_io(struct page *page, u64 offset, bool uptodate)
2714 {
2715 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2716 const u32 sectorsize = inode->root->fs_info->sectorsize;
2717 struct extent_state *cached = NULL;
2718
2719 end_page_read(page, uptodate, offset, sectorsize);
2720 if (uptodate)
2721 set_extent_uptodate(&inode->io_tree, offset,
2722 offset + sectorsize - 1, &cached, GFP_ATOMIC);
2723 unlock_extent_cached_atomic(&inode->io_tree, offset,
2724 offset + sectorsize - 1, &cached);
2725 }
2726
2727 static void submit_data_read_repair(struct inode *inode,
2728 struct btrfs_bio *failed_bbio,
2729 u32 bio_offset, const struct bio_vec *bvec,
2730 unsigned int error_bitmap)
2731 {
2732 const unsigned int pgoff = bvec->bv_offset;
2733 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2734 struct page *page = bvec->bv_page;
2735 const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
2736 const u64 end = start + bvec->bv_len - 1;
2737 const u32 sectorsize = fs_info->sectorsize;
2738 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2739 int i;
2740
2741 BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
2742
2743
2744 ASSERT(is_data_inode(inode));
2745
2746
2747 ASSERT(error_bitmap);
2748
2749
2750
2751
2752
2753 ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
2754
2755
2756 for (i = 0; i < nr_bits; i++) {
2757 const unsigned int offset = i * sectorsize;
2758 bool uptodate = false;
2759 int ret;
2760
2761 if (!(error_bitmap & (1U << i))) {
2762
2763
2764
2765
2766 uptodate = true;
2767 goto next;
2768 }
2769
2770 ret = btrfs_repair_one_sector(inode, failed_bbio,
2771 bio_offset + offset, page, pgoff + offset,
2772 btrfs_submit_data_read_bio);
2773 if (!ret) {
2774
2775
2776
2777
2778
2779
2780 continue;
2781 }
2782
2783
2784
2785
2786 next:
2787 end_sector_io(page, start + offset, uptodate);
2788 }
2789 }
2790
2791
2792
2793 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2794 {
2795 struct btrfs_inode *inode;
2796 const bool uptodate = (err == 0);
2797 int ret = 0;
2798
2799 ASSERT(page && page->mapping);
2800 inode = BTRFS_I(page->mapping->host);
2801 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
2802
2803 if (!uptodate) {
2804 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2805 u32 len;
2806
2807 ASSERT(end + 1 - start <= U32_MAX);
2808 len = end + 1 - start;
2809
2810 btrfs_page_clear_uptodate(fs_info, page, start, len);
2811 btrfs_page_set_error(fs_info, page, start, len);
2812 ret = err < 0 ? err : -EIO;
2813 mapping_set_error(page->mapping, ret);
2814 }
2815 }
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
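/*
 * Endio for buffered data writes: for each segment finish the ordered
 * extent accounting through end_extent_writepage() and clear the
 * (sub)page writeback bit.  The first segment is also used to record the
 * physical position of the write for zoned filesystems.
 */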
2826 static void end_bio_extent_writepage(struct bio *bio)
2827 {
2828 int error = blk_status_to_errno(bio->bi_status);
2829 struct bio_vec *bvec;
2830 u64 start;
2831 u64 end;
2832 struct bvec_iter_all iter_all;
2833 bool first_bvec = true;
2834
2835 ASSERT(!bio_flagged(bio, BIO_CLONED));
2836 bio_for_each_segment_all(bvec, bio, iter_all) {
2837 struct page *page = bvec->bv_page;
2838 struct inode *inode = page->mapping->host;
2839 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2840 const u32 sectorsize = fs_info->sectorsize;
2841
2842
2843 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2844 btrfs_err(fs_info,
2845 "partial page write in btrfs with offset %u and length %u",
2846 bvec->bv_offset, bvec->bv_len);
2847 else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2848 btrfs_info(fs_info,
2849 "incomplete page write with offset %u and length %u",
2850 bvec->bv_offset, bvec->bv_len);
2851
2852 start = page_offset(page) + bvec->bv_offset;
2853 end = start + bvec->bv_len - 1;
2854
2855 if (first_bvec) {
2856 btrfs_record_physical_zoned(inode, start, bio);
2857 first_bvec = false;
2858 }
2859
2860 end_extent_writepage(page, error, start, end);
2861
2862 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2863 }
2864
2865 bio_put(bio);
2866 }
2867
2868
2869
2870
2871
2872
2873
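/*
 * Accumulates contiguous ranges with the same uptodate status during read
 * endio, so that the io_tree is unlocked (and marked uptodate) in batches
 * rather than once per sector.
 */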
2874 struct processed_extent {
2875 struct btrfs_inode *inode;
2876
2877 u64 start;
2878
2879 u64 end;
2880 bool uptodate;
2881 };
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
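/*
 * Try to merge [start, end] of @inode into the range tracked by
 * @processed.  If it cannot be merged (different inode, different status,
 * or not contiguous), first release the accumulated range: mark it
 * uptodate if applicable and unlock it in the io_tree, then start
 * tracking the new range.
 */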
2894 static void endio_readpage_release_extent(struct processed_extent *processed,
2895 struct btrfs_inode *inode, u64 start, u64 end,
2896 bool uptodate)
2897 {
2898 struct extent_state *cached = NULL;
2899 struct extent_io_tree *tree;
2900
2901
2902 if (!processed->inode)
2903 goto update;
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916 if (processed->inode == inode && processed->uptodate == uptodate &&
2917 processed->end + 1 >= start && end >= processed->end) {
2918 processed->end = end;
2919 return;
2920 }
2921
2922 tree = &processed->inode->io_tree;
2923
2924
2925
2926
2927 if (processed->uptodate && tree->track_uptodate)
2928 set_extent_uptodate(tree, processed->start, processed->end,
2929 &cached, GFP_ATOMIC);
2930 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2931 &cached);
2932
2933 update:
2934
2935 processed->inode = inode;
2936 processed->start = start;
2937 processed->end = end;
2938 processed->uptodate = uptodate;
2939 }
2940
2941 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2942 {
2943 ASSERT(PageLocked(page));
2944 if (!btrfs_is_subpage(fs_info, page))
2945 return;
2946
2947 ASSERT(PagePrivate(page));
2948 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2949 }
2950
2951
2952
2953
2954
2955
2956
2957 static struct extent_buffer *find_extent_buffer_readpage(
2958 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2959 {
2960 struct extent_buffer *eb;
2961
2962
2963
2964
2965
2966 if (fs_info->nodesize >= PAGE_SIZE) {
2967 ASSERT(PagePrivate(page) && page->private);
2968 return (struct extent_buffer *)page->private;
2969 }
2970
2971
2972 rcu_read_lock();
2973 eb = radix_tree_lookup(&fs_info->buffer_radix,
2974 bytenr >> fs_info->sectorsize_bits);
2975 rcu_read_unlock();
2976 ASSERT(eb);
2977 return eb;
2978 }
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
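/*
 * Endio for read bios.  For each segment: verify data checksums or
 * validate the metadata buffer, zero out the part of the page past
 * i_size, and either hand failed data sectors to the read-repair code or
 * finish the page read and release the locked extent range.
 */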
2991 static void end_bio_extent_readpage(struct bio *bio)
2992 {
2993 struct bio_vec *bvec;
2994 struct btrfs_bio *bbio = btrfs_bio(bio);
2995 struct extent_io_tree *tree, *failure_tree;
2996 struct processed_extent processed = { 0 };
2997
2998
2999
3000
3001 u32 bio_offset = 0;
3002 int mirror;
3003 struct bvec_iter_all iter_all;
3004
3005 ASSERT(!bio_flagged(bio, BIO_CLONED));
3006 bio_for_each_segment_all(bvec, bio, iter_all) {
3007 bool uptodate = !bio->bi_status;
3008 struct page *page = bvec->bv_page;
3009 struct inode *inode = page->mapping->host;
3010 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3011 const u32 sectorsize = fs_info->sectorsize;
3012 unsigned int error_bitmap = (unsigned int)-1;
3013 bool repair = false;
3014 u64 start;
3015 u64 end;
3016 u32 len;
3017
3018 btrfs_debug(fs_info,
3019 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
3020 bio->bi_iter.bi_sector, bio->bi_status,
3021 bbio->mirror_num);
3022 tree = &BTRFS_I(inode)->io_tree;
3023 failure_tree = &BTRFS_I(inode)->io_failure_tree;
3024
3025
3026
3027
3028
3029
3030
3031
3032 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3033 btrfs_err(fs_info,
3034 "partial page read in btrfs with offset %u and length %u",
3035 bvec->bv_offset, bvec->bv_len);
3036 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3037 sectorsize))
3038 btrfs_info(fs_info,
3039 "incomplete page read with offset %u and length %u",
3040 bvec->bv_offset, bvec->bv_len);
3041
3042 start = page_offset(page) + bvec->bv_offset;
3043 end = start + bvec->bv_len - 1;
3044 len = bvec->bv_len;
3045
3046 mirror = bbio->mirror_num;
3047 if (likely(uptodate)) {
3048 if (is_data_inode(inode)) {
3049 error_bitmap = btrfs_verify_data_csum(bbio,
3050 bio_offset, page, start, end);
3051 if (error_bitmap)
3052 uptodate = false;
3053 } else {
3054 if (btrfs_validate_metadata_buffer(bbio,
3055 page, start, end, mirror))
3056 uptodate = false;
3057 }
3058 }
3059
3060 if (likely(uptodate)) {
3061 loff_t i_size = i_size_read(inode);
3062 pgoff_t end_index = i_size >> PAGE_SHIFT;
3063
3064 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3065 failure_tree, tree, start, page,
3066 btrfs_ino(BTRFS_I(inode)), 0);
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077 if (page->index == end_index && i_size <= end) {
3078 u32 zero_start = max(offset_in_page(i_size),
3079 offset_in_page(start));
3080
3081 zero_user_segment(page, zero_start,
3082 offset_in_page(end) + 1);
3083 }
3084 } else if (is_data_inode(inode)) {
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094 if (mirror > 0)
3095 repair = true;
3096 } else {
3097 struct extent_buffer *eb;
3098
3099 eb = find_extent_buffer_readpage(fs_info, page, start);
3100 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3101 eb->read_mirror = mirror;
3102 atomic_dec(&eb->io_pages);
3103 }
3104
3105 if (repair) {
3106
3107
3108
3109
3110 submit_data_read_repair(inode, bbio, bio_offset, bvec,
3111 error_bitmap);
3112 } else {
3113
3114 end_page_read(page, uptodate, start, len);
3115 endio_readpage_release_extent(&processed, BTRFS_I(inode),
3116 start, end, PageUptodate(page));
3117 }
3118
3119 ASSERT(bio_offset + len > bio_offset);
3120 bio_offset += len;
3121
3122 }
3123
3124 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
3125 btrfs_bio_free_csum(bbio);
3126 bio_put(bio);
3127 }
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
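/*
 * Fill every empty slot of @page_array with a newly allocated page using
 * the bulk page allocator, retrying while forward progress is made.
 * Returns 0 on success or -ENOMEM if an allocation round makes no
 * progress; on failure the caller is responsible for freeing whatever
 * pages were allocated.
 */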
3140 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3141 {
3142 unsigned int allocated;
3143
3144 for (allocated = 0; allocated < nr_pages;) {
3145 unsigned int last = allocated;
3146
3147 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3148
3149 if (allocated == nr_pages)
3150 return 0;
3151
3152
3153
3154
3155
3156
3157 if (allocated == last)
3158 return -ENOMEM;
3159
3160 memalloc_retry_wait(GFP_NOFS);
3161 }
3162 return 0;
3163 }
3164
3165
3166
3167
3168
3169
3170 static inline void btrfs_bio_init(struct btrfs_bio *bbio)
3171 {
3172 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
3173 }
3174
3175
3176
3177
3178
3179
3180 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
3181 {
3182 struct bio *bio;
3183
3184 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
3185 bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
3186 btrfs_bio_init(btrfs_bio(bio));
3187 return bio;
3188 }
3189
3190 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
3191 {
3192 struct bio *bio;
3193 struct btrfs_bio *bbio;
3194
3195 ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3196
3197
3198 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
3199 ASSERT(bio);
3200
3201 bbio = btrfs_bio(bio);
3202 btrfs_bio_init(bbio);
3203
3204 bio_trim(bio, offset >> 9, size >> 9);
3205 bbio->iter = bio->bi_iter;
3206 return bio;
3207 }
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
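/*
 * Try to add up to @size bytes of @page at @pg_offset to the bio under
 * construction in @bio_ctrl.  The data is only added if it uses the same
 * compression type, is contiguous with what is already in the bio, and
 * does not cross the current stripe or ordered extent boundary.
 *
 * Returns the number of bytes added; 0 means the caller has to submit
 * the current bio and allocate a new one.
 */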
3226 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3227 struct page *page,
3228 u64 disk_bytenr, unsigned int size,
3229 unsigned int pg_offset,
3230 enum btrfs_compression_type compress_type)
3231 {
3232 struct bio *bio = bio_ctrl->bio;
3233 u32 bio_size = bio->bi_iter.bi_size;
3234 u32 real_size;
3235 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3236 bool contig = false;
3237 int ret;
3238
3239 ASSERT(bio);
3240
3241 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3242 if (bio_ctrl->compress_type != compress_type)
3243 return 0;
3244
3245
3246 if (bio->bi_iter.bi_size == 0) {
3247
3248 contig = true;
3249 } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
3250 struct bio_vec *bvec = bio_last_bvec_all(bio);
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262 if (bio_end_sector(bio) == sector &&
3263 page_offset(bvec->bv_page) + bvec->bv_offset +
3264 bvec->bv_len == page_offset(page) + pg_offset)
3265 contig = true;
3266 } else {
3267
3268
3269
3270
3271 contig = bio->bi_iter.bi_sector == sector;
3272 }
3273
3274 if (!contig)
3275 return 0;
3276
3277 real_size = min(bio_ctrl->len_to_oe_boundary,
3278 bio_ctrl->len_to_stripe_boundary) - bio_size;
3279 real_size = min(real_size, size);
3280
3281
3282
3283
3284
3285 if (real_size == 0)
3286 return 0;
3287
3288 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
3289 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
3290 else
3291 ret = bio_add_page(bio, page, real_size, pg_offset);
3292
3293 return ret;
3294 }
3295
3296 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3297 struct btrfs_inode *inode, u64 file_offset)
3298 {
3299 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3300 struct btrfs_io_geometry geom;
3301 struct btrfs_ordered_extent *ordered;
3302 struct extent_map *em;
3303 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3304 int ret;
3305
3306
3307
3308
3309
3310
3311
3312
3313 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
3314 bio_ctrl->len_to_oe_boundary = U32_MAX;
3315 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3316 return 0;
3317 }
3318 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3319 if (IS_ERR(em))
3320 return PTR_ERR(em);
3321 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3322 logical, &geom);
3323 free_extent_map(em);
3324 if (ret < 0) {
3325 return ret;
3326 }
3327 if (geom.len > U32_MAX)
3328 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3329 else
3330 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3331
3332 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3333 bio_ctrl->len_to_oe_boundary = U32_MAX;
3334 return 0;
3335 }
3336
3337
3338 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
3339 if (!ordered) {
3340 bio_ctrl->len_to_oe_boundary = U32_MAX;
3341 return 0;
3342 }
3343
3344 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3345 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3346 btrfs_put_ordered_extent(ordered);
3347 return 0;
3348 }
3349
3350 static int alloc_new_bio(struct btrfs_inode *inode,
3351 struct btrfs_bio_ctrl *bio_ctrl,
3352 struct writeback_control *wbc,
3353 blk_opf_t opf,
3354 bio_end_io_t end_io_func,
3355 u64 disk_bytenr, u32 offset, u64 file_offset,
3356 enum btrfs_compression_type compress_type)
3357 {
3358 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3359 struct bio *bio;
3360 int ret;
3361
3362 bio = btrfs_bio_alloc(BIO_MAX_VECS);
3363
3364
3365
3366
3367 if (compress_type != BTRFS_COMPRESS_NONE)
3368 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
3369 else
3370 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
3371 bio_ctrl->bio = bio;
3372 bio_ctrl->compress_type = compress_type;
3373 bio->bi_end_io = end_io_func;
3374 bio->bi_opf = opf;
3375 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3376 if (ret < 0)
3377 goto error;
3378
3379 if (wbc) {
3380
3381
3382
3383
3384
3385 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3386 struct btrfs_device *dev;
3387
3388 dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3389 fs_info->sectorsize);
3390 if (IS_ERR(dev)) {
3391 ret = PTR_ERR(dev);
3392 goto error;
3393 }
3394
3395 bio_set_dev(bio, dev->bdev);
3396 } else {
3397
3398
3399
3400
3401
3402
3403
3404 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
3405 }
3406 wbc_init_bio(wbc, bio);
3407 } else {
3408 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
3409 }
3410 return 0;
3411 error:
3412 bio_ctrl->bio = NULL;
3413 bio->bi_status = errno_to_blk_status(ret);
3414 bio_endio(bio);
3415 return ret;
3416 }
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
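/*
 * Submit IO for the range [@pg_offset, @pg_offset + @size) of @page at
 * logical address @disk_bytenr.  The range is appended to the bio held in
 * @bio_ctrl when possible; otherwise (or when @force_bio_submit is set)
 * the current bio is submitted and a new one allocated.  A single call
 * can therefore end up spanning several bios when stripe or ordered
 * extent boundaries are hit.
 */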
3432 static int submit_extent_page(blk_opf_t opf,
3433 struct writeback_control *wbc,
3434 struct btrfs_bio_ctrl *bio_ctrl,
3435 struct page *page, u64 disk_bytenr,
3436 size_t size, unsigned long pg_offset,
3437 bio_end_io_t end_io_func,
3438 enum btrfs_compression_type compress_type,
3439 bool force_bio_submit)
3440 {
3441 int ret = 0;
3442 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3443 unsigned int cur = pg_offset;
3444
3445 ASSERT(bio_ctrl);
3446
3447 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3448 pg_offset + size <= PAGE_SIZE);
3449 if (force_bio_submit)
3450 submit_one_bio(bio_ctrl);
3451
3452 while (cur < pg_offset + size) {
3453 u32 offset = cur - pg_offset;
3454 int added;
3455
3456
3457 if (!bio_ctrl->bio) {
3458 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3459 end_io_func, disk_bytenr, offset,
3460 page_offset(page) + cur,
3461 compress_type);
3462 if (ret < 0)
3463 return ret;
3464 }
3465
3466
3467
3468
3469 if (compress_type != BTRFS_COMPRESS_NONE)
3470 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3471 size - offset, pg_offset + offset,
3472 compress_type);
3473 else
3474 added = btrfs_bio_add_page(bio_ctrl, page,
3475 disk_bytenr + offset, size - offset,
3476 pg_offset + offset, compress_type);
3477
3478
3479 if (!is_data_inode(&inode->vfs_inode))
3480 ASSERT(added == 0 || added == size - offset);
3481
3482
3483 if (wbc && added)
3484 wbc_account_cgroup_owner(wbc, page, added);
3485
3486
3487 if (added < size - offset) {
3488
3489 ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3490 submit_one_bio(bio_ctrl);
3491 }
3492 cur += added;
3493 }
3494 return 0;
3495 }
3496
3497 static int attach_extent_buffer_page(struct extent_buffer *eb,
3498 struct page *page,
3499 struct btrfs_subpage *prealloc)
3500 {
3501 struct btrfs_fs_info *fs_info = eb->fs_info;
3502 int ret = 0;
3503
3504
3505
3506
3507
3508
3509
3510 if (page->mapping)
3511 lockdep_assert_held(&page->mapping->private_lock);
3512
3513 if (fs_info->nodesize >= PAGE_SIZE) {
3514 if (!PagePrivate(page))
3515 attach_page_private(page, eb);
3516 else
3517 WARN_ON(page->private != (unsigned long)eb);
3518 return 0;
3519 }
3520
3521
3522 if (PagePrivate(page)) {
3523 btrfs_free_subpage(prealloc);
3524 return 0;
3525 }
3526
3527 if (prealloc)
3528
3529 attach_page_private(page, prealloc);
3530 else
3531
3532 ret = btrfs_attach_subpage(fs_info, page,
3533 BTRFS_SUBPAGE_METADATA);
3534 return ret;
3535 }
3536
3537 int set_page_extent_mapped(struct page *page)
3538 {
3539 struct btrfs_fs_info *fs_info;
3540
3541 ASSERT(page->mapping);
3542
3543 if (PagePrivate(page))
3544 return 0;
3545
3546 fs_info = btrfs_sb(page->mapping->host->i_sb);
3547
3548 if (btrfs_is_subpage(fs_info, page))
3549 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3550
3551 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3552 return 0;
3553 }
3554
3555 void clear_page_extent_mapped(struct page *page)
3556 {
3557 struct btrfs_fs_info *fs_info;
3558
3559 ASSERT(page->mapping);
3560
3561 if (!PagePrivate(page))
3562 return;
3563
3564 fs_info = btrfs_sb(page->mapping->host->i_sb);
3565 if (btrfs_is_subpage(fs_info, page))
3566 return btrfs_detach_subpage(fs_info, page);
3567
3568 detach_page_private(page);
3569 }
3570
3571 static struct extent_map *
3572 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3573 u64 start, u64 len, struct extent_map **em_cached)
3574 {
3575 struct extent_map *em;
3576
3577 if (em_cached && *em_cached) {
3578 em = *em_cached;
3579 if (extent_map_in_tree(em) && start >= em->start &&
3580 start < extent_map_end(em)) {
3581 refcount_inc(&em->refs);
3582 return em;
3583 }
3584
3585 free_extent_map(em);
3586 *em_cached = NULL;
3587 }
3588
3589 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3590 if (em_cached && !IS_ERR(em)) {
3591 BUG_ON(*em_cached);
3592 refcount_inc(&em->refs);
3593 *em_cached = em;
3594 }
3595 return em;
3596 }
3597
3598
3599
3600
3601
3602
3603
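/*
 * Core readpage implementation: walk the page sector by sector, look up
 * the extent map covering each range, zero holes and the area past
 * i_size, and submit reads for everything else.  The caller must hold the
 * extent range locked; the endio handler unlocks it as IO completes.
 */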
3604 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3605 struct btrfs_bio_ctrl *bio_ctrl,
3606 blk_opf_t read_flags, u64 *prev_em_start)
3607 {
3608 struct inode *inode = page->mapping->host;
3609 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3610 u64 start = page_offset(page);
3611 const u64 end = start + PAGE_SIZE - 1;
3612 u64 cur = start;
3613 u64 extent_offset;
3614 u64 last_byte = i_size_read(inode);
3615 u64 block_start;
3616 u64 cur_end;
3617 struct extent_map *em;
3618 int ret = 0;
3619 size_t pg_offset = 0;
3620 size_t iosize;
3621 size_t blocksize = inode->i_sb->s_blocksize;
3622 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3623
3624 ret = set_page_extent_mapped(page);
3625 if (ret < 0) {
3626 unlock_extent(tree, start, end);
3627 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3628 unlock_page(page);
3629 goto out;
3630 }
3631
3632 if (page->index == last_byte >> PAGE_SHIFT) {
3633 size_t zero_offset = offset_in_page(last_byte);
3634
3635 if (zero_offset) {
3636 iosize = PAGE_SIZE - zero_offset;
3637 memzero_page(page, zero_offset, iosize);
3638 }
3639 }
3640 begin_page_read(fs_info, page);
3641 while (cur <= end) {
3642 unsigned long this_bio_flag = 0;
3643 bool force_bio_submit = false;
3644 u64 disk_bytenr;
3645
3646 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
3647 if (cur >= last_byte) {
3648 struct extent_state *cached = NULL;
3649
3650 iosize = PAGE_SIZE - pg_offset;
3651 memzero_page(page, pg_offset, iosize);
3652 set_extent_uptodate(tree, cur, cur + iosize - 1,
3653 &cached, GFP_NOFS);
3654 unlock_extent_cached(tree, cur,
3655 cur + iosize - 1, &cached);
3656 end_page_read(page, true, cur, iosize);
3657 break;
3658 }
3659 em = __get_extent_map(inode, page, pg_offset, cur,
3660 end - cur + 1, em_cached);
3661 if (IS_ERR(em)) {
3662 unlock_extent(tree, cur, end);
3663 end_page_read(page, false, cur, end + 1 - cur);
3664 ret = PTR_ERR(em);
3665 break;
3666 }
3667 extent_offset = cur - em->start;
3668 BUG_ON(extent_map_end(em) <= cur);
3669 BUG_ON(end < cur);
3670
3671 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3672 this_bio_flag = em->compress_type;
3673
3674 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3675 cur_end = min(extent_map_end(em) - 1, end);
3676 iosize = ALIGN(iosize, blocksize);
3677 if (this_bio_flag != BTRFS_COMPRESS_NONE)
3678 disk_bytenr = em->block_start;
3679 else
3680 disk_bytenr = em->block_start + extent_offset;
3681 block_start = em->block_start;
3682 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3683 block_start = EXTENT_MAP_HOLE;
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3720 prev_em_start && *prev_em_start != (u64)-1 &&
3721 *prev_em_start != em->start)
3722 force_bio_submit = true;
3723
3724 if (prev_em_start)
3725 *prev_em_start = em->start;
3726
3727 free_extent_map(em);
3728 em = NULL;
3729
3730
3731 if (block_start == EXTENT_MAP_HOLE) {
3732 struct extent_state *cached = NULL;
3733
3734 memzero_page(page, pg_offset, iosize);
3735
3736 set_extent_uptodate(tree, cur, cur + iosize - 1,
3737 &cached, GFP_NOFS);
3738 unlock_extent_cached(tree, cur,
3739 cur + iosize - 1, &cached);
3740 end_page_read(page, true, cur, iosize);
3741 cur = cur + iosize;
3742 pg_offset += iosize;
3743 continue;
3744 }
3745
3746 if (test_range_bit(tree, cur, cur_end,
3747 EXTENT_UPTODATE, 1, NULL)) {
3748 unlock_extent(tree, cur, cur + iosize - 1);
3749 end_page_read(page, true, cur, iosize);
3750 cur = cur + iosize;
3751 pg_offset += iosize;
3752 continue;
3753 }
3754
3755
3756
3757 if (block_start == EXTENT_MAP_INLINE) {
3758 unlock_extent(tree, cur, cur + iosize - 1);
3759 end_page_read(page, false, cur, iosize);
3760 cur = cur + iosize;
3761 pg_offset += iosize;
3762 continue;
3763 }
3764
3765 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3766 bio_ctrl, page, disk_bytenr, iosize,
3767 pg_offset, end_bio_extent_readpage,
3768 this_bio_flag, force_bio_submit);
3769 if (ret) {
3770
3771
3772
3773
3774 unlock_extent(tree, cur, end);
3775 end_page_read(page, false, cur, end + 1 - cur);
3776 goto out;
3777 }
3778 cur = cur + iosize;
3779 pg_offset += iosize;
3780 }
3781 out:
3782 return ret;
3783 }
3784
3785 int btrfs_read_folio(struct file *file, struct folio *folio)
3786 {
3787 struct page *page = &folio->page;
3788 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3789 u64 start = page_offset(page);
3790 u64 end = start + PAGE_SIZE - 1;
3791 struct btrfs_bio_ctrl bio_ctrl = { 0 };
3792 int ret;
3793
3794 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3795
3796 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3797
3798
3799
3800
3801 submit_one_bio(&bio_ctrl);
3802 return ret;
3803 }
3804
3805 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3806 u64 start, u64 end,
3807 struct extent_map **em_cached,
3808 struct btrfs_bio_ctrl *bio_ctrl,
3809 u64 *prev_em_start)
3810 {
3811 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3812 int index;
3813
3814 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3815
3816 for (index = 0; index < nr_pages; index++) {
3817 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3818 REQ_RAHEAD, prev_em_start);
3819 put_page(pages[index]);
3820 }
3821 }
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
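/*
 * Helper for __extent_writepage(): find and run every delalloc range that
 * overlaps the page.  Returns 1 if btrfs_run_delalloc_range() already took
 * care of writing the whole page (the caller must not touch the page
 * again), 0 if the page is still locked and ready for
 * __extent_writepage_io(), or a negative errno on error.
 */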
3833 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3834 struct page *page, struct writeback_control *wbc)
3835 {
3836 const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3837 u64 delalloc_start = page_offset(page);
3838 u64 delalloc_to_write = 0;
3839
3840 unsigned long nr_written = 0;
3841 int ret;
3842 int page_started = 0;
3843
3844 while (delalloc_start < page_end) {
3845 u64 delalloc_end = page_end;
3846 bool found;
3847
3848 found = find_lock_delalloc_range(&inode->vfs_inode, page,
3849 &delalloc_start,
3850 &delalloc_end);
3851 if (!found) {
3852 delalloc_start = delalloc_end + 1;
3853 continue;
3854 }
3855 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3856 delalloc_end, &page_started, &nr_written, wbc);
3857 if (ret) {
3858 btrfs_page_set_error(inode->root->fs_info, page,
3859 page_offset(page), PAGE_SIZE);
3860 return ret;
3861 }
3862
3863
3864
3865
3866 delalloc_to_write += (delalloc_end - delalloc_start +
3867 PAGE_SIZE) >> PAGE_SHIFT;
3868 delalloc_start = delalloc_end + 1;
3869 }
3870 if (wbc->nr_to_write < delalloc_to_write) {
3871 int thresh = 8192;
3872
3873 if (delalloc_to_write < thresh * 2)
3874 thresh = delalloc_to_write;
3875 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3876 thresh);
3877 }
3878
3879
3880 if (page_started) {
3881
3882
3883
3884
3885 wbc->nr_to_write -= nr_written;
3886 return 1;
3887 }
3888
3889 return 0;
3890 }
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
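/*
 * Find the next dirty range inside the page starting at *@start.  For
 * sectorsize == PAGE_SIZE the whole page is returned; for subpage the
 * per-sector dirty bitmap is consulted so only truly dirty sectors get
 * submitted.  On return [*@start, *@end) covers the next dirty range,
 * which may be empty.
 */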
3907 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3908 struct page *page, u64 *start, u64 *end)
3909 {
3910 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
3911 struct btrfs_subpage_info *spi = fs_info->subpage_info;
3912 u64 orig_start = *start;
3913
3914 unsigned long flags;
3915 int range_start_bit;
3916 int range_end_bit;
3917
3918
3919
3920
3921
3922 if (!btrfs_is_subpage(fs_info, page)) {
3923 *start = page_offset(page);
3924 *end = page_offset(page) + PAGE_SIZE;
3925 return;
3926 }
3927
3928 range_start_bit = spi->dirty_offset +
3929 (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3930
3931
3932 spin_lock_irqsave(&subpage->lock, flags);
3933 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3934 spi->dirty_offset + spi->bitmap_nr_bits);
3935 spin_unlock_irqrestore(&subpage->lock, flags);
3936
3937 range_start_bit -= spi->dirty_offset;
3938 range_end_bit -= spi->dirty_offset;
3939
3940 *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3941 *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3942 }
3943
3944
3945
3946
3947
3948
3949
3950
3951
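/*
 * Helper for __extent_writepage(): map the dirty ranges of the page to
 * extents and submit write bios for them.  Returns 1 if the COW fixup
 * redirtied and unlocked the page, 0 on success (page still locked), or a
 * negative errno on error.
 */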
3952 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
3953 struct page *page,
3954 struct writeback_control *wbc,
3955 struct extent_page_data *epd,
3956 loff_t i_size,
3957 int *nr_ret)
3958 {
3959 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3960 u64 cur = page_offset(page);
3961 u64 end = cur + PAGE_SIZE - 1;
3962 u64 extent_offset;
3963 u64 block_start;
3964 struct extent_map *em;
3965 int saved_ret = 0;
3966 int ret = 0;
3967 int nr = 0;
3968 enum req_op op = REQ_OP_WRITE;
3969 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
3970 bool has_error = false;
3971 bool compressed;
3972
3973 ret = btrfs_writepage_cow_fixup(page);
3974 if (ret) {
3975
3976 redirty_page_for_writepage(wbc, page);
3977 unlock_page(page);
3978 return 1;
3979 }
3980
3981
3982
3983
3984
3985 wbc->nr_to_write--;
3986
3987 while (cur <= end) {
3988 u64 disk_bytenr;
3989 u64 em_end;
3990 u64 dirty_range_start = cur;
3991 u64 dirty_range_end;
3992 u32 iosize;
3993
3994 if (cur >= i_size) {
3995 btrfs_writepage_endio_finish_ordered(inode, page, cur,
3996 end, true);
3997
3998
3999
4000
4001
4002
4003
4004
4005 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
4006 break;
4007 }
4008
4009 find_next_dirty_byte(fs_info, page, &dirty_range_start,
4010 &dirty_range_end);
4011 if (cur < dirty_range_start) {
4012 cur = dirty_range_start;
4013 continue;
4014 }
4015
4016 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
4017 if (IS_ERR(em)) {
4018 btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
4019 ret = PTR_ERR_OR_ZERO(em);
4020 has_error = true;
4021 if (!saved_ret)
4022 saved_ret = ret;
4023 break;
4024 }
4025
4026 extent_offset = cur - em->start;
4027 em_end = extent_map_end(em);
4028 ASSERT(cur <= em_end);
4029 ASSERT(cur < end);
4030 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
4031 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
4032 block_start = em->block_start;
4033 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4034 disk_bytenr = em->block_start + extent_offset;
4035
4036
4037
4038
4039
4040 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
4041
4042 if (btrfs_use_zone_append(inode, em->block_start))
4043 op = REQ_OP_ZONE_APPEND;
4044
4045 free_extent_map(em);
4046 em = NULL;
4047
4048
4049
4050
4051
4052 if (compressed || block_start == EXTENT_MAP_HOLE ||
4053 block_start == EXTENT_MAP_INLINE) {
4054 if (compressed)
4055 nr++;
4056 else
4057 btrfs_writepage_endio_finish_ordered(inode,
4058 page, cur, cur + iosize - 1, true);
4059 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4060 cur += iosize;
4061 continue;
4062 }
4063
4064 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
4065 if (!PageWriteback(page)) {
4066 btrfs_err(inode->root->fs_info,
4067 "page %lu not writeback, cur %llu end %llu",
4068 page->index, cur, end);
4069 }
4070
4071
4072
4073
4074
4075
4076
4077 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4078
4079 ret = submit_extent_page(op | write_flags, wbc,
4080 &epd->bio_ctrl, page,
4081 disk_bytenr, iosize,
4082 cur - page_offset(page),
4083 end_bio_extent_writepage,
4084 0, false);
4085 if (ret) {
4086 has_error = true;
4087 if (!saved_ret)
4088 saved_ret = ret;
4089
4090 btrfs_page_set_error(fs_info, page, cur, iosize);
4091 if (PageWriteback(page))
4092 btrfs_page_clear_writeback(fs_info, page, cur,
4093 iosize);
4094 }
4095
4096 cur += iosize;
4097 nr++;
4098 }
4099
4100
4101
4102
4103 if (!has_error)
4104 btrfs_page_assert_not_dirty(fs_info, page);
4105 else
4106 ret = saved_ret;
4107 *nr_ret = nr;
4108 return ret;
4109 }
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
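/*
 * Write out one dirty page: zero the tail past i_size, run the delalloc
 * ranges overlapping the page and then submit bios for its dirty sectors
 * via __extent_writepage_io().  The page is no longer locked by the time
 * this returns.
 */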
4120 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
4121 struct extent_page_data *epd)
4122 {
4123 struct folio *folio = page_folio(page);
4124 struct inode *inode = page->mapping->host;
4125 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4126 const u64 page_start = page_offset(page);
4127 const u64 page_end = page_start + PAGE_SIZE - 1;
4128 int ret;
4129 int nr = 0;
4130 size_t pg_offset;
4131 loff_t i_size = i_size_read(inode);
4132 unsigned long end_index = i_size >> PAGE_SHIFT;
4133
4134 trace___extent_writepage(page, inode, wbc);
4135
4136 WARN_ON(!PageLocked(page));
4137
4138 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4139 page_offset(page), PAGE_SIZE);
4140
4141 pg_offset = offset_in_page(i_size);
4142 if (page->index > end_index ||
4143 (page->index == end_index && !pg_offset)) {
4144 folio_invalidate(folio, 0, folio_size(folio));
4145 folio_unlock(folio);
4146 return 0;
4147 }
4148
4149 if (page->index == end_index)
4150 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
4151
4152 ret = set_page_extent_mapped(page);
4153 if (ret < 0) {
4154 SetPageError(page);
4155 goto done;
4156 }
4157
4158 if (!epd->extent_locked) {
4159 ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
4160 if (ret == 1)
4161 return 0;
4162 if (ret)
4163 goto done;
4164 }
4165
4166 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
4167 &nr);
4168 if (ret == 1)
4169 return 0;
4170
4171 done:
4172 if (nr == 0) {
4173
4174 set_page_writeback(page);
4175 end_page_writeback(page);
4176 }
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208 if (PageError(page))
4209 end_extent_writepage(page, ret, page_start, page_end);
4210 if (epd->extent_locked) {
4211
4212
4213
4214
4215
4216
4217 ASSERT(wbc);
4218 btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4219 wbc->range_end + 1 - wbc->range_start);
4220 } else {
4221 unlock_page(page);
4222 }
4223 ASSERT(ret <= 0);
4224 return ret;
4225 }
4226
4227 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
4228 {
4229 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4230 TASK_UNINTERRUPTIBLE);
4231 }
4232
4233 static void end_extent_buffer_writeback(struct extent_buffer *eb)
4234 {
4235 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4236 smp_mb__after_atomic();
4237 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
4238 }
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
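/*
 * Lock an extent buffer (and, for nodesize >= PAGE_SIZE, its pages) for
 * writeback, flushing the pending write bio and waiting for writeback in
 * progress when the caller requested synchronous IO.  Returns 1 if the
 * buffer was dirty and is now marked for writeback (it must be
 * submitted), 0 if it was clean and nothing needs to be written.
 */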
4250 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
4251 struct extent_page_data *epd)
4252 {
4253 struct btrfs_fs_info *fs_info = eb->fs_info;
4254 int i, num_pages;
4255 int flush = 0;
4256 int ret = 0;
4257
4258 if (!btrfs_try_tree_write_lock(eb)) {
4259 submit_write_bio(epd, 0);
4260 flush = 1;
4261 btrfs_tree_lock(eb);
4262 }
4263
4264 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4265 btrfs_tree_unlock(eb);
4266 if (!epd->sync_io)
4267 return 0;
4268 if (!flush) {
4269 submit_write_bio(epd, 0);
4270 flush = 1;
4271 }
4272 while (1) {
4273 wait_on_extent_buffer_writeback(eb);
4274 btrfs_tree_lock(eb);
4275 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4276 break;
4277 btrfs_tree_unlock(eb);
4278 }
4279 }
4280
4281
4282
4283
4284
4285
4286 spin_lock(&eb->refs_lock);
4287 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4288 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4289 spin_unlock(&eb->refs_lock);
4290 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4291 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4292 -eb->len,
4293 fs_info->dirty_metadata_batch);
4294 ret = 1;
4295 } else {
4296 spin_unlock(&eb->refs_lock);
4297 }
4298
4299 btrfs_tree_unlock(eb);
4300
4301
4302
4303
4304
4305
4306
4307 if (!ret || fs_info->nodesize < PAGE_SIZE)
4308 return ret;
4309
4310 num_pages = num_extent_pages(eb);
4311 for (i = 0; i < num_pages; i++) {
4312 struct page *p = eb->pages[i];
4313
4314 if (!trylock_page(p)) {
4315 if (!flush) {
4316 submit_write_bio(epd, 0);
4317 flush = 1;
4318 }
4319 lock_page(p);
4320 }
4321 }
4322
4323 return ret;
4324 }
4325
4326 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
4327 {
4328 struct btrfs_fs_info *fs_info = eb->fs_info;
4329
4330 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
4331 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4332 return;
4333
4334
4335
4336
4337
4338 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4339
4340
4341
4342
4343
4344
4345
4346 mapping_set_error(page->mapping, -EIO);
4347
4348
4349
4350
4351
4352 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4353 eb->len, fs_info->dirty_metadata_batch);
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393 switch (eb->log_index) {
4394 case -1:
4395 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
4396 break;
4397 case 0:
4398 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
4399 break;
4400 case 1:
4401 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
4402 break;
4403 default:
4404 BUG();
4405 }
4406 }
4407
4408
4409
4410
4411
4412 static struct extent_buffer *find_extent_buffer_nolock(
4413 struct btrfs_fs_info *fs_info, u64 start)
4414 {
4415 struct extent_buffer *eb;
4416
4417 rcu_read_lock();
4418 eb = radix_tree_lookup(&fs_info->buffer_radix,
4419 start >> fs_info->sectorsize_bits);
4420 if (eb && atomic_inc_not_zero(&eb->refs)) {
4421 rcu_read_unlock();
4422 return eb;
4423 }
4424 rcu_read_unlock();
4425 return NULL;
4426 }
4427
4428
4429
4430
4431
4432
4433
4434 static void end_bio_subpage_eb_writepage(struct bio *bio)
4435 {
4436 struct btrfs_fs_info *fs_info;
4437 struct bio_vec *bvec;
4438 struct bvec_iter_all iter_all;
4439
4440 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4441 ASSERT(fs_info->nodesize < PAGE_SIZE);
4442
4443 ASSERT(!bio_flagged(bio, BIO_CLONED));
4444 bio_for_each_segment_all(bvec, bio, iter_all) {
4445 struct page *page = bvec->bv_page;
4446 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4447 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4448 u64 cur_bytenr = bvec_start;
4449
4450 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4451
4452
4453 while (cur_bytenr <= bvec_end) {
4454 struct extent_buffer *eb;
4455 int done;
4456
4457
4458
4459
4460
4461
4462 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4463 ASSERT(eb);
4464
4465 cur_bytenr = eb->start + eb->len;
4466
4467 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4468 done = atomic_dec_and_test(&eb->io_pages);
4469 ASSERT(done);
4470
4471 if (bio->bi_status ||
4472 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4473 ClearPageUptodate(page);
4474 set_btree_ioerr(page, eb);
4475 }
4476
4477 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4478 eb->len);
4479 end_extent_buffer_writeback(eb);
4480
4481
4482
4483
4484
4485 atomic_dec(&eb->refs);
4486 }
4487 }
4488 bio_put(bio);
4489 }
4490
4491 static void end_bio_extent_buffer_writepage(struct bio *bio)
4492 {
4493 struct bio_vec *bvec;
4494 struct extent_buffer *eb;
4495 int done;
4496 struct bvec_iter_all iter_all;
4497
4498 ASSERT(!bio_flagged(bio, BIO_CLONED));
4499 bio_for_each_segment_all(bvec, bio, iter_all) {
4500 struct page *page = bvec->bv_page;
4501
4502 eb = (struct extent_buffer *)page->private;
4503 BUG_ON(!eb);
4504 done = atomic_dec_and_test(&eb->io_pages);
4505
4506 if (bio->bi_status ||
4507 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4508 ClearPageUptodate(page);
4509 set_btree_ioerr(page, eb);
4510 }
4511
4512 end_page_writeback(page);
4513
4514 if (!done)
4515 continue;
4516
4517 end_extent_buffer_writeback(eb);
4518 }
4519
4520 bio_put(bio);
4521 }
4522
4523 static void prepare_eb_write(struct extent_buffer *eb)
4524 {
4525 u32 nritems;
4526 unsigned long start;
4527 unsigned long end;
4528
4529 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4530 atomic_set(&eb->io_pages, num_extent_pages(eb));
4531
4532
4533 nritems = btrfs_header_nritems(eb);
4534 if (btrfs_header_level(eb) > 0) {
4535 end = btrfs_node_key_ptr_offset(nritems);
4536 memzero_extent_buffer(eb, end, eb->len - end);
4537 } else {
4538
4539
4540
4541
4542 start = btrfs_item_nr_offset(nritems);
4543 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4544 memzero_extent_buffer(eb, start, end - start);
4545 }
4546 }
4547
4548
4549
4550
4551
4552 static int write_one_subpage_eb(struct extent_buffer *eb,
4553 struct writeback_control *wbc,
4554 struct extent_page_data *epd)
4555 {
4556 struct btrfs_fs_info *fs_info = eb->fs_info;
4557 struct page *page = eb->pages[0];
4558 blk_opf_t write_flags = wbc_to_write_flags(wbc);
4559 bool no_dirty_ebs = false;
4560 int ret;
4561
4562 prepare_eb_write(eb);
4563
4564
4565 lock_page(page);
4566 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4567
4568
4569 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4570 eb->start, eb->len);
4571 if (no_dirty_ebs)
4572 clear_page_dirty_for_io(page);
4573
4574 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4575 &epd->bio_ctrl, page, eb->start, eb->len,
4576 eb->start - page_offset(page),
4577 end_bio_subpage_eb_writepage, 0, false);
4578 if (ret) {
4579 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4580 set_btree_ioerr(page, eb);
4581 unlock_page(page);
4582
4583 if (atomic_dec_and_test(&eb->io_pages))
4584 end_extent_buffer_writeback(eb);
4585 return -EIO;
4586 }
4587 unlock_page(page);
4588
4589
4590
4591
4592 if (no_dirty_ebs)
4593 wbc->nr_to_write--;
4594 return ret;
4595 }
4596
4597 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
4598 struct writeback_control *wbc,
4599 struct extent_page_data *epd)
4600 {
4601 u64 disk_bytenr = eb->start;
4602 int i, num_pages;
4603 blk_opf_t write_flags = wbc_to_write_flags(wbc);
4604 int ret = 0;
4605
4606 prepare_eb_write(eb);
4607
4608 num_pages = num_extent_pages(eb);
4609 for (i = 0; i < num_pages; i++) {
4610 struct page *p = eb->pages[i];
4611
4612 clear_page_dirty_for_io(p);
4613 set_page_writeback(p);
4614 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4615 &epd->bio_ctrl, p, disk_bytenr,
4616 PAGE_SIZE, 0,
4617 end_bio_extent_buffer_writepage,
4618 0, false);
4619 if (ret) {
4620 set_btree_ioerr(p, eb);
4621 if (PageWriteback(p))
4622 end_page_writeback(p);
4623 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4624 end_extent_buffer_writeback(eb);
4625 ret = -EIO;
4626 break;
4627 }
4628 disk_bytenr += PAGE_SIZE;
4629 wbc->nr_to_write--;
4630 unlock_page(p);
4631 }
4632
4633 if (unlikely(ret)) {
4634 for (; i < num_pages; i++) {
4635 struct page *p = eb->pages[i];
4636 clear_page_dirty_for_io(p);
4637 unlock_page(p);
4638 }
4639 }
4640
4641 return ret;
4642 }
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
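/*
 * Submit all dirty extent buffers contained in one subpage btree page by
 * walking the per-sector dirty bitmap: each dirty buffer is looked up,
 * locked for writeback and written out.  Returns the number of extent
 * buffers submitted, or a negative errno on fatal error (in which case
 * the pending write bio is flushed with that error).
 */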
4658 static int submit_eb_subpage(struct page *page,
4659 struct writeback_control *wbc,
4660 struct extent_page_data *epd)
4661 {
4662 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4663 int submitted = 0;
4664 u64 page_start = page_offset(page);
4665 int bit_start = 0;
4666 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4667 int ret;
4668
4669
4670 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
4671 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4672 struct extent_buffer *eb;
4673 unsigned long flags;
4674 u64 start;
4675
4676
4677
4678
4679
4680 spin_lock(&page->mapping->private_lock);
4681 if (!PagePrivate(page)) {
4682 spin_unlock(&page->mapping->private_lock);
4683 break;
4684 }
4685 spin_lock_irqsave(&subpage->lock, flags);
4686 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4687 subpage->bitmaps)) {
4688 spin_unlock_irqrestore(&subpage->lock, flags);
4689 spin_unlock(&page->mapping->private_lock);
4690 bit_start++;
4691 continue;
4692 }
4693
4694 start = page_start + bit_start * fs_info->sectorsize;
4695 bit_start += sectors_per_node;
4696
4697
4698
4699
4700
4701 eb = find_extent_buffer_nolock(fs_info, start);
4702 spin_unlock_irqrestore(&subpage->lock, flags);
4703 spin_unlock(&page->mapping->private_lock);
4704
4705
4706
4707
4708
4709
4710 if (!eb)
4711 continue;
4712
4713 ret = lock_extent_buffer_for_io(eb, epd);
4714 if (ret == 0) {
4715 free_extent_buffer(eb);
4716 continue;
4717 }
4718 if (ret < 0) {
4719 free_extent_buffer(eb);
4720 goto cleanup;
4721 }
4722 ret = write_one_subpage_eb(eb, wbc, epd);
4723 free_extent_buffer(eb);
4724 if (ret < 0)
4725 goto cleanup;
4726 submitted++;
4727 }
4728 return submitted;
4729
4730 cleanup:
4731
4732 submit_write_bio(epd, ret);
4733 return ret;
4734 }
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
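/*
 * Submit the extent buffer that backs @page if it needs writeback.
 * @eb_context remembers the last buffer handled so that other pages of an
 * already submitted (multi-page) buffer are skipped.  Returns 1 if a
 * buffer was submitted, 0 if there was nothing to do, or a negative errno
 * on error.
 */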
4756 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4757 struct extent_page_data *epd,
4758 struct extent_buffer **eb_context)
4759 {
4760 struct address_space *mapping = page->mapping;
4761 struct btrfs_block_group *cache = NULL;
4762 struct extent_buffer *eb;
4763 int ret;
4764
4765 if (!PagePrivate(page))
4766 return 0;
4767
4768 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
4769 return submit_eb_subpage(page, wbc, epd);
4770
4771 spin_lock(&mapping->private_lock);
4772 if (!PagePrivate(page)) {
4773 spin_unlock(&mapping->private_lock);
4774 return 0;
4775 }
4776
4777 eb = (struct extent_buffer *)page->private;
4778
4779
4780
4781
4782
4783 if (WARN_ON(!eb)) {
4784 spin_unlock(&mapping->private_lock);
4785 return 0;
4786 }
4787
4788 if (eb == *eb_context) {
4789 spin_unlock(&mapping->private_lock);
4790 return 0;
4791 }
4792 ret = atomic_inc_not_zero(&eb->refs);
4793 spin_unlock(&mapping->private_lock);
4794 if (!ret)
4795 return 0;
4796
4797 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4798
4799
4800
4801
4802 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4803 ret = -EAGAIN;
4804 else
4805 ret = 0;
4806 free_extent_buffer(eb);
4807 return ret;
4808 }
4809
4810 *eb_context = eb;
4811
4812 ret = lock_extent_buffer_for_io(eb, epd);
4813 if (ret <= 0) {
4814 btrfs_revert_meta_write_pointer(cache, eb);
4815 if (cache)
4816 btrfs_put_block_group(cache);
4817 free_extent_buffer(eb);
4818 return ret;
4819 }
4820 if (cache) {
4821
4822
4823
4824 btrfs_schedule_zone_finish_bg(cache, eb);
4825 btrfs_put_block_group(cache);
4826 }
4827 ret = write_one_eb(eb, wbc, epd);
4828 free_extent_buffer(eb);
4829 if (ret < 0)
4830 return ret;
4831 return 1;
4832 }
4833
4834 int btree_write_cache_pages(struct address_space *mapping,
4835 struct writeback_control *wbc)
4836 {
4837 struct extent_buffer *eb_context = NULL;
4838 struct extent_page_data epd = {
4839 .bio_ctrl = { 0 },
4840 .extent_locked = 0,
4841 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4842 };
4843 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4844 int ret = 0;
4845 int done = 0;
4846 int nr_to_write_done = 0;
4847 struct pagevec pvec;
4848 int nr_pages;
4849 pgoff_t index;
4850 pgoff_t end;
4851 int scanned = 0;
4852 xa_mark_t tag;
4853
4854 pagevec_init(&pvec);
4855 if (wbc->range_cyclic) {
4856 index = mapping->writeback_index;
4857 end = -1;
4858
4859
4860
4861
4862 scanned = (index == 0);
4863 } else {
4864 index = wbc->range_start >> PAGE_SHIFT;
4865 end = wbc->range_end >> PAGE_SHIFT;
4866 scanned = 1;
4867 }
4868 if (wbc->sync_mode == WB_SYNC_ALL)
4869 tag = PAGECACHE_TAG_TOWRITE;
4870 else
4871 tag = PAGECACHE_TAG_DIRTY;
4872 btrfs_zoned_meta_io_lock(fs_info);
4873 retry:
4874 if (wbc->sync_mode == WB_SYNC_ALL)
4875 tag_pages_for_writeback(mapping, index, end);
4876 while (!done && !nr_to_write_done && (index <= end) &&
4877 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4878 tag))) {
4879 unsigned i;
4880
4881 for (i = 0; i < nr_pages; i++) {
4882 struct page *page = pvec.pages[i];
4883
4884 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4885 if (ret == 0)
4886 continue;
4887 if (ret < 0) {
4888 done = 1;
4889 break;
4890 }
4891
4892
4893
4894
4895
4896
4897 nr_to_write_done = wbc->nr_to_write <= 0;
4898 }
4899 pagevec_release(&pvec);
4900 cond_resched();
4901 }
4902 if (!scanned && !done) {
4903
4904
4905
4906
4907 scanned = 1;
4908 index = 0;
4909 goto retry;
4910 }
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941 if (ret > 0)
4942 ret = 0;
4943 if (!ret && BTRFS_FS_ERROR(fs_info))
4944 ret = -EROFS;
4945 submit_write_bio(&epd, ret);
4946
4947 btrfs_zoned_meta_io_unlock(fs_info);
4948 return ret;
4949 }
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
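/*
 * Walk the dirty pages of @mapping and write them out, much like
 * write_cache_pages(), but route each page through __extent_writepage()
 * so one bio can span multiple pages.  For WB_SYNC_ALL writeback, pages
 * already under IO are waited for rather than skipped, as data integrity
 * callers such as fsync() require.
 */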
4966 static int extent_write_cache_pages(struct address_space *mapping,
4967 struct writeback_control *wbc,
4968 struct extent_page_data *epd)
4969 {
4970 struct inode *inode = mapping->host;
4971 int ret = 0;
4972 int done = 0;
4973 int nr_to_write_done = 0;
4974 struct pagevec pvec;
4975 int nr_pages;
4976 pgoff_t index;
4977 pgoff_t end;
4978 pgoff_t done_index;
4979 int range_whole = 0;
4980 int scanned = 0;
4981 xa_mark_t tag;
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992 if (!igrab(inode))
4993 return 0;
4994
4995 pagevec_init(&pvec);
4996 if (wbc->range_cyclic) {
4997 index = mapping->writeback_index;
4998 end = -1;
4999
5000
5001
5002
5003 scanned = (index == 0);
5004 } else {
5005 index = wbc->range_start >> PAGE_SHIFT;
5006 end = wbc->range_end >> PAGE_SHIFT;
5007 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
5008 range_whole = 1;
5009 scanned = 1;
5010 }
5011
5012
5013
5014
5015
5016
5017
5018
5019 if (range_whole && wbc->nr_to_write == LONG_MAX &&
5020 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
5021 &BTRFS_I(inode)->runtime_flags))
5022 wbc->tagged_writepages = 1;
5023
5024 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5025 tag = PAGECACHE_TAG_TOWRITE;
5026 else
5027 tag = PAGECACHE_TAG_DIRTY;
5028 retry:
5029 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5030 tag_pages_for_writeback(mapping, index, end);
5031 done_index = index;
5032 while (!done && !nr_to_write_done && (index <= end) &&
5033 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5034 &index, end, tag))) {
5035 unsigned i;
5036
5037 for (i = 0; i < nr_pages; i++) {
5038 struct page *page = pvec.pages[i];
5039
5040 done_index = page->index + 1;
5041
5042
5043
5044
5045
5046
5047
5048 if (!trylock_page(page)) {
5049 submit_write_bio(epd, 0);
5050 lock_page(page);
5051 }
5052
5053 if (unlikely(page->mapping != mapping)) {
5054 unlock_page(page);
5055 continue;
5056 }
5057
5058 if (wbc->sync_mode != WB_SYNC_NONE) {
5059 if (PageWriteback(page))
5060 submit_write_bio(epd, 0);
5061 wait_on_page_writeback(page);
5062 }
5063
5064 if (PageWriteback(page) ||
5065 !clear_page_dirty_for_io(page)) {
5066 unlock_page(page);
5067 continue;
5068 }
5069
5070 ret = __extent_writepage(page, wbc, epd);
5071 if (ret < 0) {
5072 done = 1;
5073 break;
5074 }
5075
5076
5077
5078
5079
5080
5081 nr_to_write_done = wbc->nr_to_write <= 0;
5082 }
5083 pagevec_release(&pvec);
5084 cond_resched();
5085 }
5086 if (!scanned && !done) {
5087
5088
5089
5090
5091 scanned = 1;
5092 index = 0;
5093
5094
5095
5096
5097
5098
5099
5100 submit_write_bio(epd, 0);
5101 goto retry;
5102 }
5103
5104 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5105 mapping->writeback_index = done_index;
5106
5107 btrfs_add_delayed_iput(inode);
5108 return ret;
5109 }
5110
5111 /*
5112  * Submit the pages in the range to bio for call sites which delalloc range
5113  * has already been run (aka, ordered extent inserted) and all pages are
5114  * still locked.
5115  */
5116 int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
5117 {
5118 bool found_error = false;
5119 int first_error = 0;
5120 int ret = 0;
5121 struct address_space *mapping = inode->i_mapping;
5122 struct page *page;
5123 u64 cur = start;
5124 unsigned long nr_pages;
5125 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
5126 struct extent_page_data epd = {
5127 .bio_ctrl = { 0 },
5128 .extent_locked = 1,
5129 .sync_io = 1,
5130 };
5131 struct writeback_control wbc_writepages = {
5132 .sync_mode = WB_SYNC_ALL,
5133 .range_start = start,
5134 .range_end = end + 1,
5135 /* We're called from an async helper function */
5136 .punt_to_cgroup = 1,
5137 .no_cgroup_owner = 1,
5138 };
5139
5140 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5141 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5142 PAGE_SHIFT;
5143 wbc_writepages.nr_to_write = nr_pages * 2;
5144
5145 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
5146 while (cur <= end) {
5147 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5148
5149 page = find_get_page(mapping, cur >> PAGE_SHIFT);
5150 /*
5151  * All pages in the range are locked since
5152  * btrfs_run_delalloc_range(), thus there is no way to clear
5153  * the page dirty flag.
5154  */
5155 ASSERT(PageLocked(page));
5156 ASSERT(PageDirty(page));
5157 clear_page_dirty_for_io(page);
5158 ret = __extent_writepage(page, &wbc_writepages, &epd);
5159 ASSERT(ret <= 0);
5160 if (ret < 0) {
5161 found_error = true;
5162 first_error = ret;
5163 }
5164 put_page(page);
5165 cur = cur_end + 1;
5166 }
5167
5168 submit_write_bio(&epd, found_error ? ret : 0);
5169
5170 wbc_detach_inode(&wbc_writepages);
5171 if (found_error)
5172 return first_error;
5173 return ret;
5174 }
5175
5176 int extent_writepages(struct address_space *mapping,
5177 struct writeback_control *wbc)
5178 {
5179 struct inode *inode = mapping->host;
5180 int ret = 0;
5181 struct extent_page_data epd = {
5182 .bio_ctrl = { 0 },
5183 .extent_locked = 0,
5184 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
5185 };
5186
5187 /*
5188  * Allow only a single thread to do the reloc work in zoned mode to
5189  * protect the write pointer updates.
5190  */
5191 btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
5192 ret = extent_write_cache_pages(mapping, wbc, &epd);
5193 submit_write_bio(&epd, ret);
5194 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
5195 return ret;
5196 }
5197
5198 void extent_readahead(struct readahead_control *rac)
5199 {
5200 struct btrfs_bio_ctrl bio_ctrl = { 0 };
5201 struct page *pagepool[16];
5202 struct extent_map *em_cached = NULL;
5203 u64 prev_em_start = (u64)-1;
5204 int nr;
5205
5206 while ((nr = readahead_page_batch(rac, pagepool))) {
5207 u64 contig_start = readahead_pos(rac);
5208 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
5209
5210 contiguous_readpages(pagepool, nr, contig_start, contig_end,
5211 &em_cached, &bio_ctrl, &prev_em_start);
5212 }
5213
5214 if (em_cached)
5215 free_extent_map(em_cached);
5216 submit_one_bio(&bio_ctrl);
5217 }
5218
5219 /*
5220  * basic invalidate_folio code, this waits on any locked or writeback
5221  * ranges corresponding to the folio, and then deletes any extent state
5222  * records from the tree
5223  */
5224 int extent_invalidate_folio(struct extent_io_tree *tree,
5225 struct folio *folio, size_t offset)
5226 {
5227 struct extent_state *cached_state = NULL;
5228 u64 start = folio_pos(folio);
5229 u64 end = start + folio_size(folio) - 1;
5230 size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
5231
5232 /* This function is only called for the btree inode */
5233 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5234
5235 start += ALIGN(offset, blocksize);
5236 if (start > end)
5237 return 0;
5238
5239 lock_extent_bits(tree, start, end, &cached_state);
5240 folio_wait_writeback(folio);
5241
5242 /*
5243  * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5244  * so here we only need to unlock the extent range to free any
5245  * existing extent state.
5246  */
5247 unlock_extent_cached(tree, start, end, &cached_state);
5248 return 0;
5249 }
5250
5251 /*
5252  * a helper for release_folio, this tests for areas of the page that
5253  * are locked or under IO and drops the related state bits if it is safe
5254  * to drop the page.
5255  */
5256 static int try_release_extent_state(struct extent_io_tree *tree,
5257 struct page *page, gfp_t mask)
5258 {
5259 u64 start = page_offset(page);
5260 u64 end = start + PAGE_SIZE - 1;
5261 int ret = 1;
5262
5263 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
5264 ret = 0;
5265 } else {
5266 /*
5267  * At this point we can safely clear everything except the
5268  * locked bit, the nodatasum bit and the delalloc new bit.
5269  * The delalloc new bit will be cleared by ordered extent
5270  * completion.
5271  */
5272 ret = __clear_extent_bit(tree, start, end,
5273 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5274 0, 0, NULL, mask, NULL);
5275
5276 /* if clear_extent_bit failed for enomem reasons,
5277  * we can't allow the release to continue.
5278  */
5279 if (ret < 0)
5280 ret = 0;
5281 else
5282 ret = 1;
5283 }
5284 return ret;
5285 }
5286
5287 /*
5288  * a helper for release_folio.  As long as there are no locked extents
5289  * in the range corresponding to the page, both state records and extent
5290  * map records are removed
5291  */
5292 int try_release_extent_mapping(struct page *page, gfp_t mask)
5293 {
5294 struct extent_map *em;
5295 u64 start = page_offset(page);
5296 u64 end = start + PAGE_SIZE - 1;
5297 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5298 struct extent_io_tree *tree = &btrfs_inode->io_tree;
5299 struct extent_map_tree *map = &btrfs_inode->extent_tree;
5300
5301 if (gfpflags_allow_blocking(mask) &&
5302 page->mapping->host->i_size > SZ_16M) {
5303 u64 len;
5304 while (start <= end) {
5305 struct btrfs_fs_info *fs_info;
5306 u64 cur_gen;
5307
5308 len = end - start + 1;
5309 write_lock(&map->lock);
5310 em = lookup_extent_mapping(map, start, len);
5311 if (!em) {
5312 write_unlock(&map->lock);
5313 break;
5314 }
5315 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5316 em->start != start) {
5317 write_unlock(&map->lock);
5318 free_extent_map(em);
5319 break;
5320 }
5321 if (test_range_bit(tree, em->start,
5322 extent_map_end(em) - 1,
5323 EXTENT_LOCKED, 0, NULL))
5324 goto next;
5325 /*
5326  * If it's not in the list of modified extents, used
5327  * by a fast fsync, we can remove it. If it's being
5328  * logged we can safely remove it since fsync took an
5329  * extra reference on the em.
5330  */
5331 if (list_empty(&em->list) ||
5332 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5333 goto remove_em;
5334
5335 /*
5336  * If it's in the list of modified extents, remove it only if its
5337  * generation is older than the current one, in which case we don't
5338  * need it for a fast fsync. Otherwise don't remove it, we could be
5339  * racing with an ongoing fast fsync that could miss the new extent.
5340  */
5341 fs_info = btrfs_inode->root->fs_info;
5342 spin_lock(&fs_info->trans_lock);
5343 cur_gen = fs_info->generation;
5344 spin_unlock(&fs_info->trans_lock);
5345 if (em->generation >= cur_gen)
5346 goto next;
5347 remove_em:
5348 /*
5349  * We only remove extent maps that are not in the list of
5350  * modified extents or that are in the list but with a
5351  * generation lower than the current generation, so there is no
5352  * need to set the full fsync flag on the inode (it hurts the
5353  * fsync performance for workloads with a data size that exceeds
5354  * or is close to the system's memory).
5355  */
5356 remove_extent_mapping(map, em);
5357 /* once for the rb tree */
5358 free_extent_map(em);
5359 next:
5360 start = extent_map_end(em);
5361 write_unlock(&map->lock);
5362
5363 /* once for us */
5364 free_extent_map(em);
5365
5366 cond_resched();
5367 }
5368 }
5369 return try_release_extent_state(tree, page, mask);
5370 }
5371
5372 /*
5373  * helper function for fiemap, which doesn't want to see any holes.
5374  * This maps until we find something past 'last'
5375  */
5376 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
5377 u64 offset, u64 last)
5378 {
5379 u64 sectorsize = btrfs_inode_sectorsize(inode);
5380 struct extent_map *em;
5381 u64 len;
5382
5383 if (offset >= last)
5384 return NULL;
5385
5386 while (1) {
5387 len = last - offset;
5388 if (len == 0)
5389 break;
5390 len = ALIGN(len, sectorsize);
5391 em = btrfs_get_extent_fiemap(inode, offset, len);
5392 if (IS_ERR(em))
5393 return em;
5394
5395 /* if this isn't a hole return it */
5396 if (em->block_start != EXTENT_MAP_HOLE)
5397 return em;
5398
5399 /* this is a hole, advance to the next extent */
5400 offset = extent_map_end(em);
5401 free_extent_map(em);
5402 if (offset >= last)
5403 break;
5404 }
5405 return NULL;
5406 }
5407
5408 /*
5409  * To cache previous fiemap extent
5410  *
5411  * Will be used for merging fiemap extent
5412  */
5413 struct fiemap_cache {
5414 u64 offset;
5415 u64 phys;
5416 u64 len;
5417 u32 flags;
5418 bool cached;
5419 };
5420
5421 /*
5422  * Helper to submit fiemap extent.
5423  *
5424  * Will try to merge current fiemap extent specified by @offset, @phys,
5425  * @len and @flags with cached one.
5426  * And only when we fail to merge, the cached one will be submitted as
5427  * a fiemap extent.
5428  *
5429  * Return value is the same as fiemap_fill_next_extent().
5430  */
5431 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5432 struct fiemap_cache *cache,
5433 u64 offset, u64 phys, u64 len, u32 flags)
5434 {
5435 int ret = 0;
5436
5437 if (!cache->cached)
5438 goto assign;
5439
5440 /*
5441  * Sanity check, extent_fiemap() should have ensured that new
5442  * fiemap extent won't overlap with cached one.
5443  * Not recoverable.
5444  *
5445  * NOTE: Physical address can overlap, due to compression
5446  */
5447 if (cache->offset + cache->len > offset) {
5448 WARN_ON(1);
5449 return -EINVAL;
5450 }
5451
5452 /*
5453  * Only merges fiemap extents if
5454  * 1) Their logical addresses are continuous
5455  *
5456  * 2) Their physical addresses are continuous
5457  *    So truly compressed (physical size smaller than logical size)
5458  *    extents won't get merged with each other
5459  *
5460  * 3) Share same flags except FIEMAP_EXTENT_LAST
5461  *    So regular extent won't get merged with prealloc extent
5462  */
5463 if (cache->offset + cache->len == offset &&
5464 cache->phys + cache->len == phys &&
5465 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5466 (flags & ~FIEMAP_EXTENT_LAST)) {
5467 cache->len += len;
5468 cache->flags |= flags;
5469 goto try_submit_last;
5470 }
5471
5472 /* Not mergeable, need to submit cached one */
5473 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5474 cache->len, cache->flags);
5475 cache->cached = false;
5476 if (ret)
5477 return ret;
5478 assign:
5479 cache->cached = true;
5480 cache->offset = offset;
5481 cache->phys = phys;
5482 cache->len = len;
5483 cache->flags = flags;
5484 try_submit_last:
5485 if (cache->flags & FIEMAP_EXTENT_LAST) {
5486 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5487 cache->phys, cache->len, cache->flags);
5488 cache->cached = false;
5489 }
5490 return ret;
5491 }
5492
5493 /*
5494  * Emit last fiemap cache
5495  *
5496  * The last fiemap cache may still be cached in the following case:
5497  * 0		      4k		    8k
5498  * |<- Fiemap range ->|
5499  * |<------------  First extent ----------->|
5500  *
5501  * In this case, the first extent range will be cached but not emitted.
5502  * So we must emit it before ending extent_fiemap().
5503  */
5504 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
5505 struct fiemap_cache *cache)
5506 {
5507 int ret;
5508
5509 if (!cache->cached)
5510 return 0;
5511
5512 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5513 cache->len, cache->flags);
5514 cache->cached = false;
5515 if (ret > 0)
5516 ret = 0;
5517 return ret;
5518 }
5519
5520 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
5521 u64 start, u64 len)
5522 {
5523 int ret = 0;
5524 u64 off;
5525 u64 max = start + len;
5526 u32 flags = 0;
5527 u32 found_type;
5528 u64 last;
5529 u64 last_for_get_extent = 0;
5530 u64 disko = 0;
5531 u64 isize = i_size_read(&inode->vfs_inode);
5532 struct btrfs_key found_key;
5533 struct extent_map *em = NULL;
5534 struct extent_state *cached_state = NULL;
5535 struct btrfs_path *path;
5536 struct btrfs_root *root = inode->root;
5537 struct fiemap_cache cache = { 0 };
5538 struct ulist *roots;
5539 struct ulist *tmp_ulist;
5540 int end = 0;
5541 u64 em_start = 0;
5542 u64 em_len = 0;
5543 u64 em_end = 0;
5544
5545 if (len == 0)
5546 return -EINVAL;
5547
5548 path = btrfs_alloc_path();
5549 if (!path)
5550 return -ENOMEM;
5551
5552 roots = ulist_alloc(GFP_KERNEL);
5553 tmp_ulist = ulist_alloc(GFP_KERNEL);
5554 if (!roots || !tmp_ulist) {
5555 ret = -ENOMEM;
5556 goto out_free_ulist;
5557 }
5558
5559 /*
5560  * We can't initialize 'off' to 'start' as this could miss extents due
5561  * to extent item merging.
5562  */
5563 off = 0;
5564 start = round_down(start, btrfs_inode_sectorsize(inode));
5565 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
5566
5567 /*
5568  * lookup the last file extent.  We're not using i_size here
5569  * because there might be preallocation past i_size
5570  */
5571 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5572 0);
5573 if (ret < 0) {
5574 goto out_free_ulist;
5575 } else {
5576 WARN_ON(!ret);
5577 if (ret == 1)
5578 ret = 0;
5579 }
5580
5581 path->slots[0]--;
5582 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5583 found_type = found_key.type;
5584
5585 /* No extents, but there might be delalloc bits */
5586 if (found_key.objectid != btrfs_ino(inode) ||
5587 found_type != BTRFS_EXTENT_DATA_KEY) {
5588 /* have to trust i_size as the end */
5589 last = (u64)-1;
5590 last_for_get_extent = isize;
5591 } else {
5592 /*
5593  * remember the start of the last extent.  There are a
5594  * bunch of different factors that go into the length of the
5595  * extent, so it's much less complex to remember where it started
5596  */
5597 last = found_key.offset;
5598 last_for_get_extent = last + 1;
5599 }
5600 btrfs_release_path(path);
5601
5602 /*
5603  * we might have some extents allocated but more delalloc past those
5604  * extents.  so, we trust isize unless the start of the last extent is
5605  * beyond isize
5606  */
5607 if (last < isize) {
5608 last = (u64)-1;
5609 last_for_get_extent = isize;
5610 }
5611
5612 lock_extent_bits(&inode->io_tree, start, start + len - 1,
5613 &cached_state);
5614
5615 em = get_extent_skip_holes(inode, start, last_for_get_extent);
5616 if (!em)
5617 goto out;
5618 if (IS_ERR(em)) {
5619 ret = PTR_ERR(em);
5620 goto out;
5621 }
5622
5623 while (!end) {
5624 u64 offset_in_extent = 0;
5625
5626 /* break if the extent we found is outside the range */
5627 if (em->start >= max || extent_map_end(em) < off)
5628 break;
5629
5630 /*
5631  * get_extent may return an extent that starts before our
5632  * requested range.  We have to make sure the ranges
5633  * we return to fiemap always move forward and don't
5634  * overlap, so adjust the offsets here
5635  */
5636 em_start = max(em->start, off);
5637
5638 /*
5639  * record the offset from the start of the extent
5640  * for adjusting the disk offset below.  Only do this if the
5641  * extent isn't compressed since our in ram offset may be past
5642  * what we have actually allocated on disk.
5643  */
5644 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5645 offset_in_extent = em_start - em->start;
5646 em_end = extent_map_end(em);
5647 em_len = em_end - em_start;
5648 flags = 0;
5649 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5650 disko = em->block_start + offset_in_extent;
5651 else
5652 disko = 0;
5653
5654 /*
5655  * bump off for our next call to get_extent
5656  */
5657 off = extent_map_end(em);
5658 if (off >= max)
5659 end = 1;
5660
5661 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
5662 end = 1;
5663 flags |= FIEMAP_EXTENT_LAST;
5664 } else if (em->block_start == EXTENT_MAP_INLINE) {
5665 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5666 FIEMAP_EXTENT_NOT_ALIGNED);
5667 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
5668 flags |= (FIEMAP_EXTENT_DELALLOC |
5669 FIEMAP_EXTENT_UNKNOWN);
5670 } else if (fieinfo->fi_extents_max) {
5671 u64 bytenr = em->block_start -
5672 (em->start - em->orig_start);
5673
5674 /*
5675  * As btrfs supports shared space, this information
5676  * can be exported to userspace tools via
5677  * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
5678  * then we're just getting a count and we can skip the
5679  * lookup stuff.
5680  */
5681 ret = btrfs_check_shared(root, btrfs_ino(inode),
5682 bytenr, roots, tmp_ulist);
5683 if (ret < 0)
5684 goto out_free;
5685 if (ret)
5686 flags |= FIEMAP_EXTENT_SHARED;
5687 ret = 0;
5688 }
5689 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5690 flags |= FIEMAP_EXTENT_ENCODED;
5691 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5692 flags |= FIEMAP_EXTENT_UNWRITTEN;
5693
5694 free_extent_map(em);
5695 em = NULL;
5696 if ((em_start >= last) || em_len == (u64)-1 ||
5697 (last == (u64)-1 && isize <= em_end)) {
5698 flags |= FIEMAP_EXTENT_LAST;
5699 end = 1;
5700 }
5701
5702 /* now scan forward to see if this is really the last extent. */
5703 em = get_extent_skip_holes(inode, off, last_for_get_extent);
5704 if (IS_ERR(em)) {
5705 ret = PTR_ERR(em);
5706 goto out;
5707 }
5708 if (!em) {
5709 flags |= FIEMAP_EXTENT_LAST;
5710 end = 1;
5711 }
5712 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5713 em_len, flags);
5714 if (ret) {
5715 if (ret == 1)
5716 ret = 0;
5717 goto out_free;
5718 }
5719 }
5720 out_free:
5721 if (!ret)
5722 ret = emit_last_fiemap_cache(fieinfo, &cache);
5723 free_extent_map(em);
5724 out:
5725 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
5726 &cached_state);
5727
5728 out_free_ulist:
5729 btrfs_free_path(path);
5730 ulist_free(roots);
5731 ulist_free(tmp_ulist);
5732 return ret;
5733 }
5734
5735 static void __free_extent_buffer(struct extent_buffer *eb)
5736 {
5737 kmem_cache_free(extent_buffer_cache, eb);
5738 }
5739
5740 int extent_buffer_under_io(const struct extent_buffer *eb)
5741 {
5742 return (atomic_read(&eb->io_pages) ||
5743 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5744 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5745 }
5746
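/*
 * Editor's note: check whether the page still has extent buffers attached
 * (subpage eb_refs) or active subpage readers; if so it is not yet safe to
 * detach the page's btrfs_subpage structure.
 */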
5747 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
5748 {
5749 struct btrfs_subpage *subpage;
5750
5751 lockdep_assert_held(&page->mapping->private_lock);
5752
5753 if (PagePrivate(page)) {
5754 subpage = (struct btrfs_subpage *)page->private;
5755 if (atomic_read(&subpage->eb_refs))
5756 return true;
5757 /*
5758  * Even if there are no eb refs here, we may still have
5759  * end_page_read() calls relying on page::private.
5760  */
5761 if (atomic_read(&subpage->readers))
5762 return true;
5763 }
5764 return false;
5765 }
5766
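/*
 * Editor's note: detach @eb from @page.  For regular (nodesize >= PAGE_SIZE)
 * buffers clear page->private if it still points to this eb; for subpage
 * buffers drop one eb reference and detach the btrfs_subpage structure once
 * no eb or reader uses the page any more.
 */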
5767 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5768 {
5769 struct btrfs_fs_info *fs_info = eb->fs_info;
5770 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5771
5772 /*
5773  * For mapped eb, we're going to change the page private, which should
5774  * be done under the private_lock.
5775  */
5776 if (mapped)
5777 spin_lock(&page->mapping->private_lock);
5778
5779 if (!PagePrivate(page)) {
5780 if (mapped)
5781 spin_unlock(&page->mapping->private_lock);
5782 return;
5783 }
5784
5785 if (fs_info->nodesize >= PAGE_SIZE) {
5786 /*
5787  * We do this since we'll remove the pages after we've
5788  * removed the eb from the radix tree, so we could race
5789  * and have this page now attached to the new eb.  So
5790  * only clear page_private if it's still connected to
5791  * this eb.
5792  */
5793 if (PagePrivate(page) &&
5794 page->private == (unsigned long)eb) {
5795 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5796 BUG_ON(PageDirty(page));
5797 BUG_ON(PageWriteback(page));
5798 /*
5799  * We need to make sure we haven't been attached
5800  * to a new eb.
5801  */
5802 detach_page_private(page);
5803 }
5804 if (mapped)
5805 spin_unlock(&page->mapping->private_lock);
5806 return;
5807 }
5808
5809 /*
5810  * For subpage case, we can have dummy eb with page private.  In this case,
5811  * we can directly detach the private as such page is only attached to
5812  * one dummy eb, no sharing.
5813  */
5814 if (!mapped) {
5815 btrfs_detach_subpage(fs_info, page);
5816 return;
5817 }
5818
5819 btrfs_page_dec_eb_refs(fs_info, page);
5820
5821 /*
5822  * We can only detach the page private if there are no other ebs in the
5823  * page range and no unfinished IO.
5824  */
5825 if (!page_range_has_eb(fs_info, page))
5826 btrfs_detach_subpage(fs_info, page);
5827
5828 spin_unlock(&page->mapping->private_lock);
5829 }
5830
5831 /* Release all pages attached to the extent buffer */
5832 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5833 {
5834 int i;
5835 int num_pages;
5836
5837 ASSERT(!extent_buffer_under_io(eb));
5838
5839 num_pages = num_extent_pages(eb);
5840 for (i = 0; i < num_pages; i++) {
5841 struct page *page = eb->pages[i];
5842
5843 if (!page)
5844 continue;
5845
5846 detach_extent_buffer_page(eb, page);
5847
5848 /* One for when we allocated the page */
5849 put_page(page);
5850 }
5851 }
5852
5853 /*
5854  * Helper for releasing the extent buffer.
5855  */
5856 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5857 {
5858 btrfs_release_extent_buffer_pages(eb);
5859 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5860 __free_extent_buffer(eb);
5861 }
5862
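/*
 * Editor's note: allocate and initialize the extent_buffer structure itself
 * (refcounts, lock, leak-debug list).  The backing pages are allocated and
 * attached by the callers.
 */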
5863 static struct extent_buffer *
5864 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5865 unsigned long len)
5866 {
5867 struct extent_buffer *eb = NULL;
5868
5869 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5870 eb->start = start;
5871 eb->len = len;
5872 eb->fs_info = fs_info;
5873 eb->bflags = 0;
5874 init_rwsem(&eb->lock);
5875
5876 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5877 &fs_info->allocated_ebs);
5878 INIT_LIST_HEAD(&eb->release_list);
5879
5880 spin_lock_init(&eb->refs_lock);
5881 atomic_set(&eb->refs, 1);
5882 atomic_set(&eb->io_pages, 0);
5883
5884 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5885
5886 return eb;
5887 }
5888
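/*
 * Editor's note: create a private, unmapped copy of @src: allocate fresh
 * pages, attach them to the new buffer and copy the contents over.  Returns
 * NULL on allocation failure.
 */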
5889 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5890 {
5891 int i;
5892 struct extent_buffer *new;
5893 int num_pages = num_extent_pages(src);
5894 int ret;
5895
5896 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5897 if (new == NULL)
5898 return NULL;
5899
5900 /*
5901  * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5902  * btrfs_release_extent_buffer() has different behavior for
5903  * UNMAPPED subpage extent buffer.
5904  */
5905 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5906
5907 memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5908 ret = btrfs_alloc_page_array(num_pages, new->pages);
5909 if (ret) {
5910 btrfs_release_extent_buffer(new);
5911 return NULL;
5912 }
5913
5914 for (i = 0; i < num_pages; i++) {
5915 int ret;
5916 struct page *p = new->pages[i];
5917
5918 ret = attach_extent_buffer_page(new, p, NULL);
5919 if (ret < 0) {
5920 btrfs_release_extent_buffer(new);
5921 return NULL;
5922 }
5923 WARN_ON(PageDirty(p));
5924 copy_page(page_address(p), page_address(src->pages[i]));
5925 }
5926 set_extent_buffer_uptodate(new);
5927
5928 return new;
5929 }
5930
5931 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5932 u64 start, unsigned long len)
5933 {
5934 struct extent_buffer *eb;
5935 int num_pages;
5936 int i;
5937 int ret;
5938
5939 eb = __alloc_extent_buffer(fs_info, start, len);
5940 if (!eb)
5941 return NULL;
5942
5943 num_pages = num_extent_pages(eb);
5944 ret = btrfs_alloc_page_array(num_pages, eb->pages);
5945 if (ret)
5946 goto err;
5947
5948 for (i = 0; i < num_pages; i++) {
5949 struct page *p = eb->pages[i];
5950
5951 ret = attach_extent_buffer_page(eb, p, NULL);
5952 if (ret < 0)
5953 goto err;
5954 }
5955
5956 set_extent_buffer_uptodate(eb);
5957 btrfs_set_header_nritems(eb, 0);
5958 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5959
5960 return eb;
5961 err:
5962 for (i = 0; i < num_pages; i++) {
5963 if (eb->pages[i]) {
5964 detach_extent_buffer_page(eb, eb->pages[i]);
5965 __free_page(eb->pages[i]);
5966 }
5967 }
5968 __free_extent_buffer(eb);
5969 return NULL;
5970 }
5971
5972 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5973 u64 start)
5974 {
5975 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5976 }
5977
5978 static void check_buffer_tree_ref(struct extent_buffer *eb)
5979 {
5980 int refs;
5981
5982 /*
5983  * The TREE_REF bit is first set when the extent_buffer is added
5984  * to the radix tree. It is also reset, if unset, when a new reference
5985  * is created by find_extent_buffer.
5986  *
5987  * It is only cleared in two cases: freeing the last non-tree
5988  * reference to the extent_buffer when its STALE bit is set or
5989  * calling release_folio when the tree reference is the only reference.
5990  *
5991  * In both cases, care is taken to ensure that the extent_buffer's
5992  * pages are not under io. However, release_folio can be concurrently
5993  * called with creating new references, which is prone to race
5994  * conditions between the calls to check_buffer_tree_ref in those
5995  * codepaths and clearing TREE_REF in try_release_extent_buffer.
5996  *
5997  * The actual lifetime of the extent_buffer in the radix tree is adequately
5998  * protected by the refcount, but the TREE_REF bit and its corresponding
5999  * reference are not. To protect against this class of races, we call
6000  * check_buffer_tree_ref from the codepaths which trigger io after they set
6001  * eb->io_pages. Note that once io is initiated, TREE_REF can no longer be
6002  * cleared, so that is the moment at which any such race is best fixed.
6003  */
6004 refs = atomic_read(&eb->refs);
6005 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6006 return;
6007
6008 spin_lock(&eb->refs_lock);
6009 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6010 atomic_inc(&eb->refs);
6011 spin_unlock(&eb->refs_lock);
6012 }
6013
6014 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
6015 struct page *accessed)
6016 {
6017 int num_pages, i;
6018
6019 check_buffer_tree_ref(eb);
6020
6021 num_pages = num_extent_pages(eb);
6022 for (i = 0; i < num_pages; i++) {
6023 struct page *p = eb->pages[i];
6024
6025 if (p != accessed)
6026 mark_page_accessed(p);
6027 }
6028 }
6029
6030 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6031 u64 start)
6032 {
6033 struct extent_buffer *eb;
6034
6035 eb = find_extent_buffer_nolock(fs_info, start);
6036 if (!eb)
6037 return NULL;
6038
6039 /*
6040  * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6041  * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6042  * another task running free_extent_buffer() might have seen that flag
6043  * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6044  * writeback flags not set) and it's still in the tree (flag
6045  * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6046  * decrementing the extent buffer's reference count twice. So here we
6047  * could race and increment the eb's reference count, clear its stale
6048  * flag, mark it as dirty and drop our reference before the other task
6049  * finishes free_extent_buffer(), which would later try to free a dirty eb.
6050  */
6051 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6052 spin_lock(&eb->refs_lock);
6053 spin_unlock(&eb->refs_lock);
6054 }
6055 mark_extent_buffer_accessed(eb, NULL);
6056 return eb;
6057 }
6058
6059 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6060 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
6061 u64 start)
6062 {
6063 struct extent_buffer *eb, *exists = NULL;
6064 int ret;
6065
6066 eb = find_extent_buffer(fs_info, start);
6067 if (eb)
6068 return eb;
6069 eb = alloc_dummy_extent_buffer(fs_info, start);
6070 if (!eb)
6071 return ERR_PTR(-ENOMEM);
6072 eb->fs_info = fs_info;
6073 again:
6074 ret = radix_tree_preload(GFP_NOFS);
6075 if (ret) {
6076 exists = ERR_PTR(ret);
6077 goto free_eb;
6078 }
6079 spin_lock(&fs_info->buffer_lock);
6080 ret = radix_tree_insert(&fs_info->buffer_radix,
6081 start >> fs_info->sectorsize_bits, eb);
6082 spin_unlock(&fs_info->buffer_lock);
6083 radix_tree_preload_end();
6084 if (ret == -EEXIST) {
6085 exists = find_extent_buffer(fs_info, start);
6086 if (exists)
6087 goto free_eb;
6088 else
6089 goto again;
6090 }
6091 check_buffer_tree_ref(eb);
6092 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
6093
6094 return eb;
6095 free_eb:
6096 btrfs_release_extent_buffer(eb);
6097 return exists;
6098 }
6099 #endif
6100
6101 static struct extent_buffer *grab_extent_buffer(
6102 struct btrfs_fs_info *fs_info, struct page *page)
6103 {
6104 struct extent_buffer *exists;
6105
6106 /*
6107  * For subpage case, we completely rely on radix tree to ensure we
6108  * don't try to insert two ebs for the same bytenr.  So here we always
6109  * return NULL and just continue.
6110  */
6111 if (fs_info->nodesize < PAGE_SIZE)
6112 return NULL;
6113
6114 /* Page not yet attached to an extent buffer */
6115 if (!PagePrivate(page))
6116 return NULL;
6117
6118 /*
6119  * We could have already allocated an eb for this page and attached one
6120  * so let's see if we can get a ref on the existing eb, and if we can we
6121  * know it's good and we can just return that one, else we know we can
6122  * just overwrite page->private.
6123  */
6124 exists = (struct extent_buffer *)page->private;
6125 if (atomic_inc_not_zero(&exists->refs))
6126 return exists;
6127
6128 WARN_ON(PageDirty(page));
6129 detach_page_private(page);
6130 return NULL;
6131 }
6132
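/*
 * Editor's note: sanity check the alignment of a tree block start.  It must
 * be sectorsize aligned, a subpage tree block must not cross a page boundary,
 * and a regular (nodesize >= PAGE_SIZE) tree block must be page aligned.
 */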
6133 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6134 {
6135 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6136 btrfs_err(fs_info, "bad tree block start %llu", start);
6137 return -EINVAL;
6138 }
6139
6140 if (fs_info->nodesize < PAGE_SIZE &&
6141 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6142 btrfs_err(fs_info,
6143 "tree block crosses page boundary, start %llu nodesize %u",
6144 start, fs_info->nodesize);
6145 return -EINVAL;
6146 }
6147 if (fs_info->nodesize >= PAGE_SIZE &&
6148 !PAGE_ALIGNED(start)) {
6149 btrfs_err(fs_info,
6150 "tree block is not page aligned, start %llu nodesize %u",
6151 start, fs_info->nodesize);
6152 return -EINVAL;
6153 }
6154 return 0;
6155 }
6156
6157 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
6158 u64 start, u64 owner_root, int level)
6159 {
6160 unsigned long len = fs_info->nodesize;
6161 int num_pages;
6162 int i;
6163 unsigned long index = start >> PAGE_SHIFT;
6164 struct extent_buffer *eb;
6165 struct extent_buffer *exists = NULL;
6166 struct page *p;
6167 struct address_space *mapping = fs_info->btree_inode->i_mapping;
6168 u64 lockdep_owner = owner_root;
6169 int uptodate = 1;
6170 int ret;
6171
6172 if (check_eb_alignment(fs_info, start))
6173 return ERR_PTR(-EINVAL);
6174
6175 #if BITS_PER_LONG == 32
6176 if (start >= MAX_LFS_FILESIZE) {
6177 btrfs_err_rl(fs_info,
6178 "extent buffer %llu is beyond 32bit page cache limit", start);
6179 btrfs_err_32bit_limit(fs_info);
6180 return ERR_PTR(-EOVERFLOW);
6181 }
6182 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6183 btrfs_warn_32bit_limit(fs_info);
6184 #endif
6185
6186 eb = find_extent_buffer(fs_info, start);
6187 if (eb)
6188 return eb;
6189
6190 eb = __alloc_extent_buffer(fs_info, start, len);
6191 if (!eb)
6192 return ERR_PTR(-ENOMEM);
6193
6194 /*
6195  * The reloc trees are just snapshots, so we need them to appear to be
6196  * just like any other fs tree WRT lockdep.
6197  */
6198 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
6199 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
6200
6201 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
6202
6203 num_pages = num_extent_pages(eb);
6204 for (i = 0; i < num_pages; i++, index++) {
6205 struct btrfs_subpage *prealloc = NULL;
6206
6207 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
6208 if (!p) {
6209 exists = ERR_PTR(-ENOMEM);
6210 goto free_eb;
6211 }
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223 if (fs_info->nodesize < PAGE_SIZE) {
6224 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
6225 if (IS_ERR(prealloc)) {
6226 ret = PTR_ERR(prealloc);
6227 unlock_page(p);
6228 put_page(p);
6229 exists = ERR_PTR(ret);
6230 goto free_eb;
6231 }
6232 }
6233
6234 spin_lock(&mapping->private_lock);
6235 exists = grab_extent_buffer(fs_info, p);
6236 if (exists) {
6237 spin_unlock(&mapping->private_lock);
6238 unlock_page(p);
6239 put_page(p);
6240 mark_extent_buffer_accessed(exists, p);
6241 btrfs_free_subpage(prealloc);
6242 goto free_eb;
6243 }
6244
6245 ret = attach_extent_buffer_page(eb, p, prealloc);
6246 ASSERT(!ret);
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256 btrfs_page_inc_eb_refs(fs_info, p);
6257 spin_unlock(&mapping->private_lock);
6258
6259 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
6260 eb->pages[i] = p;
6261 if (!PageUptodate(p))
6262 uptodate = 0;
6263
6264 /*
6265  * We can't unlock the pages just yet since the extent buffer
6266  * hasn't been properly inserted in the radix tree, this
6267  * opens a race with the btree release_folio path which can free
6268  * a page while we are still filling in all pages for the buffer
6269  * and we could crash.
6270  */
6271 }
6272 if (uptodate)
6273 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6274 again:
6275 ret = radix_tree_preload(GFP_NOFS);
6276 if (ret) {
6277 exists = ERR_PTR(ret);
6278 goto free_eb;
6279 }
6280
6281 spin_lock(&fs_info->buffer_lock);
6282 ret = radix_tree_insert(&fs_info->buffer_radix,
6283 start >> fs_info->sectorsize_bits, eb);
6284 spin_unlock(&fs_info->buffer_lock);
6285 radix_tree_preload_end();
6286 if (ret == -EEXIST) {
6287 exists = find_extent_buffer(fs_info, start);
6288 if (exists)
6289 goto free_eb;
6290 else
6291 goto again;
6292 }
6293
6294 check_buffer_tree_ref(eb);
6295 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
6296
6297 /*
6298  * Now it's safe to unlock the pages because any calls to the btree
6299  * release_folio path will correctly detect that a page belongs to a
6300  * live buffer and won't free them prematurely.
6301  */
6302 for (i = 0; i < num_pages; i++)
6303 unlock_page(eb->pages[i]);
6304 return eb;
6305
6306 free_eb:
6307 WARN_ON(!atomic_dec_and_test(&eb->refs));
6308 for (i = 0; i < num_pages; i++) {
6309 if (eb->pages[i])
6310 unlock_page(eb->pages[i]);
6311 }
6312
6313 btrfs_release_extent_buffer(eb);
6314 return exists;
6315 }
6316
6317 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6318 {
6319 struct extent_buffer *eb =
6320 container_of(head, struct extent_buffer, rcu_head);
6321
6322 __free_extent_buffer(eb);
6323 }
6324
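/*
 * Editor's note: drop one reference on @eb.  When the last reference goes
 * away the buffer is removed from the buffer radix tree (if it was inserted),
 * its pages are released and the structure itself is freed via RCU.  Returns
 * 1 if the buffer was freed, 0 otherwise.
 */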
6325 static int release_extent_buffer(struct extent_buffer *eb)
6326 __releases(&eb->refs_lock)
6327 {
6328 lockdep_assert_held(&eb->refs_lock);
6329
6330 WARN_ON(atomic_read(&eb->refs) == 0);
6331 if (atomic_dec_and_test(&eb->refs)) {
6332 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
6333 struct btrfs_fs_info *fs_info = eb->fs_info;
6334
6335 spin_unlock(&eb->refs_lock);
6336
6337 spin_lock(&fs_info->buffer_lock);
6338 radix_tree_delete(&fs_info->buffer_radix,
6339 eb->start >> fs_info->sectorsize_bits);
6340 spin_unlock(&fs_info->buffer_lock);
6341 } else {
6342 spin_unlock(&eb->refs_lock);
6343 }
6344
6345 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
6346
6347 btrfs_release_extent_buffer_pages(eb);
6348 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6349 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
6350 __free_extent_buffer(eb);
6351 return 1;
6352 }
6353 #endif
6354 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
6355 return 1;
6356 }
6357 spin_unlock(&eb->refs_lock);
6358
6359 return 0;
6360 }
6361
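/*
 * Editor's note: drop a reference on @eb without taking refs_lock where
 * possible.  The cmpxchg loop handles the common case; the slow path below
 * also drops the TREE_REF for stale, idle buffers before calling
 * release_extent_buffer().
 */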
6362 void free_extent_buffer(struct extent_buffer *eb)
6363 {
6364 int refs;
6365 int old;
6366 if (!eb)
6367 return;
6368
6369 while (1) {
6370 refs = atomic_read(&eb->refs);
6371 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6372 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6373 refs == 1))
6374 break;
6375 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6376 if (old == refs)
6377 return;
6378 }
6379
6380 spin_lock(&eb->refs_lock);
6381 if (atomic_read(&eb->refs) == 2 &&
6382 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
6383 !extent_buffer_under_io(eb) &&
6384 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6385 atomic_dec(&eb->refs);
6386
6387 /*
6388  * I know this is terrible, but it's temporary until we stop tracking
6389  * the uptodate bits and such for the extent buffers.
6390  */
6391 release_extent_buffer(eb);
6392 }
6393
6394 void free_extent_buffer_stale(struct extent_buffer *eb)
6395 {
6396 if (!eb)
6397 return;
6398
6399 spin_lock(&eb->refs_lock);
6400 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6401
6402 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
6403 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6404 atomic_dec(&eb->refs);
6405 release_extent_buffer(eb);
6406 }
6407
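/*
 * Editor's note: clear the dirty bit of a btree page and, if the page ended
 * up fully clean, also clear the PAGECACHE_TAG_DIRTY tag in the xarray so
 * writeback no longer considers it.
 */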
6408 static void btree_clear_page_dirty(struct page *page)
6409 {
6410 ASSERT(PageDirty(page));
6411 ASSERT(PageLocked(page));
6412 clear_page_dirty_for_io(page);
6413 xa_lock_irq(&page->mapping->i_pages);
6414 if (!PageDirty(page))
6415 __xa_clear_mark(&page->mapping->i_pages,
6416 page_index(page), PAGECACHE_TAG_DIRTY);
6417 xa_unlock_irq(&page->mapping->i_pages);
6418 }
6419
6420 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6421 {
6422 struct btrfs_fs_info *fs_info = eb->fs_info;
6423 struct page *page = eb->pages[0];
6424 bool last;
6425
6426
6427 lock_page(page);
6428 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6429 eb->len);
6430 if (last)
6431 btree_clear_page_dirty(page);
6432 unlock_page(page);
6433 WARN_ON(atomic_read(&eb->refs) == 0);
6434 }
6435
6436 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
6437 {
6438 int i;
6439 int num_pages;
6440 struct page *page;
6441
6442 if (eb->fs_info->nodesize < PAGE_SIZE)
6443 return clear_subpage_extent_buffer_dirty(eb);
6444
6445 num_pages = num_extent_pages(eb);
6446
6447 for (i = 0; i < num_pages; i++) {
6448 page = eb->pages[i];
6449 if (!PageDirty(page))
6450 continue;
6451 lock_page(page);
6452 btree_clear_page_dirty(page);
6453 ClearPageError(page);
6454 unlock_page(page);
6455 }
6456 WARN_ON(atomic_read(&eb->refs) == 0);
6457 }
6458
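/*
 * Editor's note: mark @eb dirty, dirtying all of its pages (with the subpage
 * helpers when nodesize < PAGE_SIZE).  Returns true if the buffer was already
 * dirty.
 */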
6459 bool set_extent_buffer_dirty(struct extent_buffer *eb)
6460 {
6461 int i;
6462 int num_pages;
6463 bool was_dirty;
6464
6465 check_buffer_tree_ref(eb);
6466
6467 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
6468
6469 num_pages = num_extent_pages(eb);
6470 WARN_ON(atomic_read(&eb->refs) == 0);
6471 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6472
6473 if (!was_dirty) {
6474 bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
6475
6476 /*
6477  * For subpage case, we can have other extent buffers in the
6478  * same page, and in clear_subpage_extent_buffer_dirty() we
6479  * have to clear page dirty without subpage lock held.
6480  * This can cause race where our page gets dirty cleared after
6481  * we just set it.
6482  *
6483  * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6484  * its page for other reasons, we can use page lock to prevent
6485  * the above race.
6486  */
6487 if (subpage)
6488 lock_page(eb->pages[0]);
6489 for (i = 0; i < num_pages; i++)
6490 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6491 eb->start, eb->len);
6492 if (subpage)
6493 unlock_page(eb->pages[0]);
6494 }
6495 #ifdef CONFIG_BTRFS_DEBUG
6496 for (i = 0; i < num_pages; i++)
6497 ASSERT(PageDirty(eb->pages[i]));
6498 #endif
6499
6500 return was_dirty;
6501 }
6502
6503 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
6504 {
6505 struct btrfs_fs_info *fs_info = eb->fs_info;
6506 struct page *page;
6507 int num_pages;
6508 int i;
6509
6510 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6511 num_pages = num_extent_pages(eb);
6512 for (i = 0; i < num_pages; i++) {
6513 page = eb->pages[i];
6514 if (!page)
6515 continue;
6516
6517
6518
6519
6520
6521 if (fs_info->nodesize >= PAGE_SIZE)
6522 ClearPageUptodate(page);
6523 else
6524 btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6525 eb->len);
6526 }
6527 }
6528
6529 void set_extent_buffer_uptodate(struct extent_buffer *eb)
6530 {
6531 struct btrfs_fs_info *fs_info = eb->fs_info;
6532 struct page *page;
6533 int num_pages;
6534 int i;
6535
6536 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6537 num_pages = num_extent_pages(eb);
6538 for (i = 0; i < num_pages; i++) {
6539 page = eb->pages[i];
6540
6541
6542
6543
6544
6545 if (fs_info->nodesize >= PAGE_SIZE)
6546 SetPageUptodate(page);
6547 else
6548 btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6549 eb->len);
6550 }
6551 }
6552
6553 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6554 int mirror_num)
6555 {
6556 struct btrfs_fs_info *fs_info = eb->fs_info;
6557 struct extent_io_tree *io_tree;
6558 struct page *page = eb->pages[0];
6559 struct btrfs_bio_ctrl bio_ctrl = {
6560 .mirror_num = mirror_num,
6561 };
6562 int ret = 0;
6563
6564 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6565 ASSERT(PagePrivate(page));
6566 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6567
6568 if (wait == WAIT_NONE) {
6569 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6570 return -EAGAIN;
6571 } else {
6572 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6573 if (ret < 0)
6574 return ret;
6575 }
6576
6577 ret = 0;
6578 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6579 PageUptodate(page) ||
6580 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6581 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6582 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6583 return ret;
6584 }
6585
6586 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6587 eb->read_mirror = 0;
6588 atomic_set(&eb->io_pages, 1);
6589 check_buffer_tree_ref(eb);
6590 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6591
6592 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
6593 ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
6594 page, eb->start, eb->len,
6595 eb->start - page_offset(page),
6596 end_bio_extent_readpage, 0, true);
6597 if (ret) {
6598
6599
6600
6601
6602
6603 atomic_dec(&eb->io_pages);
6604 }
6605 submit_one_bio(&bio_ctrl);
6606 if (ret || wait != WAIT_COMPLETE)
6607 return ret;
6608
6609 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6610 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6611 ret = -EIO;
6612 return ret;
6613 }
6614
6615 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
6616 {
6617 int i;
6618 struct page *page;
6619 int err;
6620 int ret = 0;
6621 int locked_pages = 0;
6622 int all_uptodate = 1;
6623 int num_pages;
6624 unsigned long num_reads = 0;
6625 struct btrfs_bio_ctrl bio_ctrl = {
6626 .mirror_num = mirror_num,
6627 };
6628
6629 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6630 return 0;
6631
6632 /*
6633  * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
6634  * operation, which could potentially still be in flight.  In this
6635  * case we simply want to return an error.
6636  */
6637 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
6638 return -EIO;
6639
6640 if (eb->fs_info->nodesize < PAGE_SIZE)
6641 return read_extent_buffer_subpage(eb, wait, mirror_num);
6642
6643 num_pages = num_extent_pages(eb);
6644 for (i = 0; i < num_pages; i++) {
6645 page = eb->pages[i];
6646 if (wait == WAIT_NONE) {
6647 /*
6648  * WAIT_NONE is only utilized by readahead. If we can't
6649  * acquire the lock atomically it means either the eb
6650  * is being read out or under modification.
6651  * Either way the eb will be or has been cached,
6652  * readahead can exit safely.
6653  */
6654 if (!trylock_page(page))
6655 goto unlock_exit;
6656 } else {
6657 lock_page(page);
6658 }
6659 locked_pages++;
6660 }
6661
6662
6663
6664
6665
6666 for (i = 0; i < num_pages; i++) {
6667 page = eb->pages[i];
6668 if (!PageUptodate(page)) {
6669 num_reads++;
6670 all_uptodate = 0;
6671 }
6672 }
6673
6674 if (all_uptodate) {
6675 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6676 goto unlock_exit;
6677 }
6678
6679 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6680 eb->read_mirror = 0;
6681 atomic_set(&eb->io_pages, num_reads);
6682 /*
6683  * It is possible for release_folio to clear the TREE_REF bit before we
6684  * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6685  */
6686 check_buffer_tree_ref(eb);
6687 for (i = 0; i < num_pages; i++) {
6688 page = eb->pages[i];
6689
6690 if (!PageUptodate(page)) {
6691 if (ret) {
6692 atomic_dec(&eb->io_pages);
6693 unlock_page(page);
6694 continue;
6695 }
6696
6697 ClearPageError(page);
6698 err = submit_extent_page(REQ_OP_READ, NULL,
6699 &bio_ctrl, page, page_offset(page),
6700 PAGE_SIZE, 0, end_bio_extent_readpage,
6701 0, false);
6702 if (err) {
6703 /*
6704  * We failed to submit the bio so it's the
6705  * caller's responsibility to perform cleanup
6706  * i.e unlock page/set error bit.
6707  */
6708 ret = err;
6709 SetPageError(page);
6710 unlock_page(page);
6711 atomic_dec(&eb->io_pages);
6712 }
6713 } else {
6714 unlock_page(page);
6715 }
6716 }
6717
6718 submit_one_bio(&bio_ctrl);
6719
6720 if (ret || wait != WAIT_COMPLETE)
6721 return ret;
6722
6723 for (i = 0; i < num_pages; i++) {
6724 page = eb->pages[i];
6725 wait_on_page_locked(page);
6726 if (!PageUptodate(page))
6727 ret = -EIO;
6728 }
6729
6730 return ret;
6731
6732 unlock_exit:
6733 while (locked_pages > 0) {
6734 locked_pages--;
6735 page = eb->pages[locked_pages];
6736 unlock_page(page);
6737 }
6738 return ret;
6739 }
6740
6741 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6742 unsigned long len)
6743 {
6744 btrfs_warn(eb->fs_info,
6745 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6746 eb->start, eb->len, start, len);
6747 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6748
6749 return true;
6750 }
6751
6752 /*
6753  * Check if the [start, start + len) range is valid before reading/writing
6754  * the eb.
6755  * NOTE: @start and @len are offset inside the eb, not logical address.
6756  *
6757  * Caller should not touch the dst/src memory if this function returns error.
6758  */
6759 static inline int check_eb_range(const struct extent_buffer *eb,
6760 unsigned long start, unsigned long len)
6761 {
6762 unsigned long offset;
6763
6764 /* start, start + len should not go beyond eb->len nor overflow */
6765 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6766 return report_eb_range(eb, start, len);
6767
6768 return false;
6769 }
6770
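/*
 * Editor's note: copy @len bytes starting at offset @start inside @eb into
 * @dstv, crossing page boundaries as needed.
 *
 * Illustrative sketch (not taken from this file): callers usually copy an
 * item out of a leaf along the lines of
 *
 *	struct btrfs_dir_item di;
 *
 *	read_extent_buffer(leaf, &di, btrfs_item_ptr_offset(leaf, slot),
 *			   sizeof(di));
 *
 * where @leaf and @slot come from a prior btrfs_search_slot() and the exact
 * item type depends on the caller.
 */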
6771 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6772 unsigned long start, unsigned long len)
6773 {
6774 size_t cur;
6775 size_t offset;
6776 struct page *page;
6777 char *kaddr;
6778 char *dst = (char *)dstv;
6779 unsigned long i = get_eb_page_index(start);
6780
6781 if (check_eb_range(eb, start, len))
6782 return;
6783
6784 offset = get_eb_offset_in_page(eb, start);
6785
6786 while (len > 0) {
6787 page = eb->pages[i];
6788
6789 cur = min(len, (PAGE_SIZE - offset));
6790 kaddr = page_address(page);
6791 memcpy(dst, kaddr + offset, cur);
6792
6793 dst += cur;
6794 len -= cur;
6795 offset = 0;
6796 i++;
6797 }
6798 }
6799
6800 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6801 void __user *dstv,
6802 unsigned long start, unsigned long len)
6803 {
6804 size_t cur;
6805 size_t offset;
6806 struct page *page;
6807 char *kaddr;
6808 char __user *dst = (char __user *)dstv;
6809 unsigned long i = get_eb_page_index(start);
6810 int ret = 0;
6811
6812 WARN_ON(start > eb->len);
6813 WARN_ON(start + len > eb->start + eb->len);
6814
6815 offset = get_eb_offset_in_page(eb, start);
6816
6817 while (len > 0) {
6818 page = eb->pages[i];
6819
6820 cur = min(len, (PAGE_SIZE - offset));
6821 kaddr = page_address(page);
6822 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
6823 ret = -EFAULT;
6824 break;
6825 }
6826
6827 dst += cur;
6828 len -= cur;
6829 offset = 0;
6830 i++;
6831 }
6832
6833 return ret;
6834 }
6835
6836 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6837 unsigned long start, unsigned long len)
6838 {
6839 size_t cur;
6840 size_t offset;
6841 struct page *page;
6842 char *kaddr;
6843 char *ptr = (char *)ptrv;
6844 unsigned long i = get_eb_page_index(start);
6845 int ret = 0;
6846
6847 if (check_eb_range(eb, start, len))
6848 return -EINVAL;
6849
6850 offset = get_eb_offset_in_page(eb, start);
6851
6852 while (len > 0) {
6853 page = eb->pages[i];
6854
6855 cur = min(len, (PAGE_SIZE - offset));
6856
6857 kaddr = page_address(page);
6858 ret = memcmp(ptr, kaddr + offset, cur);
6859 if (ret)
6860 break;
6861
6862 ptr += cur;
6863 len -= cur;
6864 offset = 0;
6865 i++;
6866 }
6867 return ret;
6868 }
6869
6870
6871
6872
6873
6874
6875
6876 static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6877 struct page *page)
6878 {
6879 struct btrfs_fs_info *fs_info = eb->fs_info;
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890 if (fs_info->nodesize < PAGE_SIZE) {
6891 bool uptodate, error;
6892
6893 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6894 eb->start, eb->len);
6895 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6896 WARN_ON(!uptodate && !error);
6897 } else {
6898 WARN_ON(!PageUptodate(page) && !PageError(page));
6899 }
6900 }
6901
6902 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
6903 const void *srcv)
6904 {
6905 char *kaddr;
6906
6907 assert_eb_page_uptodate(eb, eb->pages[0]);
6908 kaddr = page_address(eb->pages[0]) +
6909 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6910 chunk_tree_uuid));
6911 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6912 }
6913
6914 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
6915 {
6916 char *kaddr;
6917
6918 assert_eb_page_uptodate(eb, eb->pages[0]);
6919 kaddr = page_address(eb->pages[0]) +
6920 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6921 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6922 }
6923
6924 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
6925 unsigned long start, unsigned long len)
6926 {
6927 size_t cur;
6928 size_t offset;
6929 struct page *page;
6930 char *kaddr;
6931 char *src = (char *)srcv;
6932 unsigned long i = get_eb_page_index(start);
6933
6934 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6935
6936 if (check_eb_range(eb, start, len))
6937 return;
6938
6939 offset = get_eb_offset_in_page(eb, start);
6940
6941 while (len > 0) {
6942 page = eb->pages[i];
6943 assert_eb_page_uptodate(eb, page);
6944
6945 cur = min(len, PAGE_SIZE - offset);
6946 kaddr = page_address(page);
6947 memcpy(kaddr + offset, src, cur);
6948
6949 src += cur;
6950 len -= cur;
6951 offset = 0;
6952 i++;
6953 }
6954 }
6955
6956 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
6957 unsigned long len)
6958 {
6959 size_t cur;
6960 size_t offset;
6961 struct page *page;
6962 char *kaddr;
6963 unsigned long i = get_eb_page_index(start);
6964
6965 if (check_eb_range(eb, start, len))
6966 return;
6967
6968 offset = get_eb_offset_in_page(eb, start);
6969
6970 while (len > 0) {
6971 page = eb->pages[i];
6972 assert_eb_page_uptodate(eb, page);
6973
6974 cur = min(len, PAGE_SIZE - offset);
6975 kaddr = page_address(page);
6976 memset(kaddr + offset, 0, cur);
6977
6978 len -= cur;
6979 offset = 0;
6980 i++;
6981 }
6982 }
6983
6984 void copy_extent_buffer_full(const struct extent_buffer *dst,
6985 const struct extent_buffer *src)
6986 {
6987 int i;
6988 int num_pages;
6989
6990 ASSERT(dst->len == src->len);
6991
6992 if (dst->fs_info->nodesize >= PAGE_SIZE) {
6993 num_pages = num_extent_pages(dst);
6994 for (i = 0; i < num_pages; i++)
6995 copy_page(page_address(dst->pages[i]),
6996 page_address(src->pages[i]));
6997 } else {
6998 size_t src_offset = get_eb_offset_in_page(src, 0);
6999 size_t dst_offset = get_eb_offset_in_page(dst, 0);
7000
7001 ASSERT(src->fs_info->nodesize < PAGE_SIZE);
7002 memcpy(page_address(dst->pages[0]) + dst_offset,
7003 page_address(src->pages[0]) + src_offset,
7004 src->len);
7005 }
7006 }
7007
7008 void copy_extent_buffer(const struct extent_buffer *dst,
7009 const struct extent_buffer *src,
7010 unsigned long dst_offset, unsigned long src_offset,
7011 unsigned long len)
7012 {
7013 u64 dst_len = dst->len;
7014 size_t cur;
7015 size_t offset;
7016 struct page *page;
7017 char *kaddr;
7018 unsigned long i = get_eb_page_index(dst_offset);
7019
7020 if (check_eb_range(dst, dst_offset, len) ||
7021 check_eb_range(src, src_offset, len))
7022 return;
7023
7024 WARN_ON(src->len != dst_len);
7025
7026 offset = get_eb_offset_in_page(dst, dst_offset);
7027
7028 while (len > 0) {
7029 page = dst->pages[i];
7030 assert_eb_page_uptodate(dst, page);
7031
7032 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
7033
7034 kaddr = page_address(page);
7035 read_extent_buffer(src, kaddr + offset, src_offset, cur);
7036
7037 src_offset += cur;
7038 len -= cur;
7039 offset = 0;
7040 i++;
7041 }
7042 }
7043
7044 /*
7045  * eb_bitmap_offset() - calculate the page and offset of the byte containing
7046  * the given bit number
7047  * @eb: the extent buffer
7048  * @start: offset of the bitmap item in the extent buffer
7049  * @nr: bit number
7050  * @page_index: return index of the page in the extent buffer that contains
7051  * the given bit number
7052  * @page_offset: return offset into the page given by page_index
7053  *
7054  * This helper hides the ugliness of finding the byte in an extent buffer
7055  * which contains a given bit.
7056  */
7057 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
7058 unsigned long start, unsigned long nr,
7059 unsigned long *page_index,
7060 size_t *page_offset)
7061 {
7062 size_t byte_offset = BIT_BYTE(nr);
7063 size_t offset;
7064
7065 /*
7066  * The byte we want is the offset of the extent buffer + the offset of
7067  * the bitmap item in the extent buffer + the offset of the byte in the
7068  * bitmap item.
7069  */
7070 offset = start + offset_in_page(eb->start) + byte_offset;
7071
7072 *page_index = offset >> PAGE_SHIFT;
7073 *page_offset = offset_in_page(offset);
7074 }
7075
7076 /*
7077  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7078  * @eb: the extent buffer
7079  * @start: offset of the bitmap item in the extent buffer
7080  * @nr: bit number to test
7081  */
7082 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
7083 unsigned long nr)
7084 {
7085 u8 *kaddr;
7086 struct page *page;
7087 unsigned long i;
7088 size_t offset;
7089
7090 eb_bitmap_offset(eb, start, nr, &i, &offset);
7091 page = eb->pages[i];
7092 assert_eb_page_uptodate(eb, page);
7093 kaddr = page_address(page);
7094 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7095 }
7096
7097 /*
7098  * extent_buffer_bitmap_set - set an area of a bitmap
7099  * @eb: the extent buffer
7100  * @start: offset of the bitmap item in the extent buffer
7101  * @pos: bit number of the first bit
7102  * @len: number of bits to set
7103  */
7104 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
7105 unsigned long pos, unsigned long len)
7106 {
7107 u8 *kaddr;
7108 struct page *page;
7109 unsigned long i;
7110 size_t offset;
7111 const unsigned int size = pos + len;
7112 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7113 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
7114
7115 eb_bitmap_offset(eb, start, pos, &i, &offset);
7116 page = eb->pages[i];
7117 assert_eb_page_uptodate(eb, page);
7118 kaddr = page_address(page);
7119
7120 while (len >= bits_to_set) {
7121 kaddr[offset] |= mask_to_set;
7122 len -= bits_to_set;
7123 bits_to_set = BITS_PER_BYTE;
7124 mask_to_set = ~0;
7125 if (++offset >= PAGE_SIZE && len > 0) {
7126 offset = 0;
7127 page = eb->pages[++i];
7128 assert_eb_page_uptodate(eb, page);
7129 kaddr = page_address(page);
7130 }
7131 }
7132 if (len) {
7133 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7134 kaddr[offset] |= mask_to_set;
7135 }
7136 }
7137
7138
7139 /*
7140  * extent_buffer_bitmap_clear - clear an area of a bitmap
7141  * @eb: the extent buffer
7142  * @start: offset of the bitmap item in the extent buffer
7143  * @pos: bit number of the first bit
7144  * @len: number of bits to clear
7145  */
7146 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7147 unsigned long start, unsigned long pos,
7148 unsigned long len)
7149 {
7150 u8 *kaddr;
7151 struct page *page;
7152 unsigned long i;
7153 size_t offset;
7154 const unsigned int size = pos + len;
7155 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7156 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
7157
7158 eb_bitmap_offset(eb, start, pos, &i, &offset);
7159 page = eb->pages[i];
7160 assert_eb_page_uptodate(eb, page);
7161 kaddr = page_address(page);
7162
7163 while (len >= bits_to_clear) {
7164 kaddr[offset] &= ~mask_to_clear;
7165 len -= bits_to_clear;
7166 bits_to_clear = BITS_PER_BYTE;
7167 mask_to_clear = ~0;
7168 if (++offset >= PAGE_SIZE && len > 0) {
7169 offset = 0;
7170 page = eb->pages[++i];
7171 assert_eb_page_uptodate(eb, page);
7172 kaddr = page_address(page);
7173 }
7174 }
7175 if (len) {
7176 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7177 kaddr[offset] &= ~mask_to_clear;
7178 }
7179 }
7180
7181 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
7182 {
7183 unsigned long distance = (src > dst) ? src - dst : dst - src;
7184 return distance < len;
7185 }
7186
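/*
 * Editor's note: copy @len bytes between two pages of the same extent buffer,
 * falling back to memmove() when source and destination are the same page and
 * the ranges overlap.
 */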
7187 static void copy_pages(struct page *dst_page, struct page *src_page,
7188 unsigned long dst_off, unsigned long src_off,
7189 unsigned long len)
7190 {
7191 char *dst_kaddr = page_address(dst_page);
7192 char *src_kaddr;
7193 int must_memmove = 0;
7194
7195 if (dst_page != src_page) {
7196 src_kaddr = page_address(src_page);
7197 } else {
7198 src_kaddr = dst_kaddr;
7199 if (areas_overlap(src_off, dst_off, len))
7200 must_memmove = 1;
7201 }
7202
7203 if (must_memmove)
7204 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
7205 else
7206 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
7207 }
7208
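/*
 * Editor's note: memcpy()-like copy within a single extent buffer, split at
 * page boundaries.  For potentially overlapping ranges with
 * dst_offset > src_offset use memmove_extent_buffer() instead, which copies
 * backwards.
 */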
7209 void memcpy_extent_buffer(const struct extent_buffer *dst,
7210 unsigned long dst_offset, unsigned long src_offset,
7211 unsigned long len)
7212 {
7213 size_t cur;
7214 size_t dst_off_in_page;
7215 size_t src_off_in_page;
7216 unsigned long dst_i;
7217 unsigned long src_i;
7218
7219 if (check_eb_range(dst, dst_offset, len) ||
7220 check_eb_range(dst, src_offset, len))
7221 return;
7222
7223 while (len > 0) {
7224 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7225 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
7226
7227 dst_i = get_eb_page_index(dst_offset);
7228 src_i = get_eb_page_index(src_offset);
7229
7230 cur = min(len, (unsigned long)(PAGE_SIZE -
7231 src_off_in_page));
7232 cur = min_t(unsigned long, cur,
7233 (unsigned long)(PAGE_SIZE - dst_off_in_page));
7234
7235 copy_pages(dst->pages[dst_i], dst->pages[src_i],
7236 dst_off_in_page, src_off_in_page, cur);
7237
7238 src_offset += cur;
7239 dst_offset += cur;
7240 len -= cur;
7241 }
7242 }
7243
7244 void memmove_extent_buffer(const struct extent_buffer *dst,
7245 unsigned long dst_offset, unsigned long src_offset,
7246 unsigned long len)
7247 {
7248 size_t cur;
7249 size_t dst_off_in_page;
7250 size_t src_off_in_page;
7251 unsigned long dst_end = dst_offset + len - 1;
7252 unsigned long src_end = src_offset + len - 1;
7253 unsigned long dst_i;
7254 unsigned long src_i;
7255
7256 if (check_eb_range(dst, dst_offset, len) ||
7257 check_eb_range(dst, src_offset, len))
7258 return;
7259 if (dst_offset < src_offset) {
7260 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7261 return;
7262 }
7263 while (len > 0) {
7264 dst_i = get_eb_page_index(dst_end);
7265 src_i = get_eb_page_index(src_end);
7266
7267 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7268 src_off_in_page = get_eb_offset_in_page(dst, src_end);
7269
7270 cur = min_t(unsigned long, len, src_off_in_page + 1);
7271 cur = min(cur, dst_off_in_page + 1);
7272 copy_pages(dst->pages[dst_i], dst->pages[src_i],
7273 dst_off_in_page - cur + 1,
7274 src_off_in_page - cur + 1, cur);
7275
7276 dst_end -= cur;
7277 src_end -= cur;
7278 len -= cur;
7279 }
7280 }
7281
7282 #define GANG_LOOKUP_SIZE 16
7283 static struct extent_buffer *get_next_extent_buffer(
7284 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7285 {
7286 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
7287 struct extent_buffer *found = NULL;
7288 u64 page_start = page_offset(page);
7289 u64 cur = page_start;
7290
7291 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
7292 lockdep_assert_held(&fs_info->buffer_lock);
7293
7294 while (cur < page_start + PAGE_SIZE) {
7295 int ret;
7296 int i;
7297
7298 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7299 (void **)gang, cur >> fs_info->sectorsize_bits,
7300 min_t(unsigned int, GANG_LOOKUP_SIZE,
7301 PAGE_SIZE / fs_info->nodesize));
7302 if (ret == 0)
7303 goto out;
7304 for (i = 0; i < ret; i++) {
7305 /* Already beyond page end */
7306 if (gang[i]->start >= page_start + PAGE_SIZE)
7307 goto out;
7308 /* Found one */
7309 if (gang[i]->start >= bytenr) {
7310 found = gang[i];
7311 goto out;
7312 }
7313 }
7314 cur = gang[ret - 1]->start + gang[ret - 1]->len;
7315 }
7316 out:
7317 return found;
7318 }
7319
7320 static int try_release_subpage_extent_buffer(struct page *page)
7321 {
7322 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7323 u64 cur = page_offset(page);
7324 const u64 end = page_offset(page) + PAGE_SIZE;
7325 int ret;
7326
7327 while (cur < end) {
7328 struct extent_buffer *eb = NULL;
7329
7330 /*
7331  * Unlike try_release_extent_buffer() which uses page->private
7332  * to grab buffer, for subpage case we rely on radix tree, thus
7333  * we need to ensure radix tree consistency.
7334  *
7335  * We also want an atomic snapshot of the radix tree, thus go
7336  * with spinlock rather than RCU.
7337  */
7338 spin_lock(&fs_info->buffer_lock);
7339 eb = get_next_extent_buffer(fs_info, page, cur);
7340 if (!eb) {
7341
7342 spin_unlock(&fs_info->buffer_lock);
7343 break;
7344 }
7345 cur = eb->start + eb->len;
7346
7347 /*
7348  * The same as try_release_extent_buffer(), to ensure the eb
7349  * won't disappear out from under us.
7350  */
7351 spin_lock(&eb->refs_lock);
7352 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7353 spin_unlock(&eb->refs_lock);
7354 spin_unlock(&fs_info->buffer_lock);
7355 break;
7356 }
7357 spin_unlock(&fs_info->buffer_lock);
7358
7359 /*
7360  * If tree ref isn't set then we know the ref on this eb is a
7361  * real ref, so just return, this eb will likely be freed soon
7362  * anyway.
7363  */
7364 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7365 spin_unlock(&eb->refs_lock);
7366 break;
7367 }
7368
7369 /*
7370  * Here we don't care about the return value, we will always
7371  * check the page private at the end.  And
7372  * release_extent_buffer() will release the refs_lock.
7373  */
7374 release_extent_buffer(eb);
7375 }
7376
7377 /* Finally check if we have cleared page private, as if we have released
7378  * all ebs in the page, the page private should be cleared now.
7379  */
7380 spin_lock(&page->mapping->private_lock);
7381 if (!PagePrivate(page))
7382 ret = 1;
7383 else
7384 ret = 0;
7385 spin_unlock(&page->mapping->private_lock);
7386 return ret;
7387
7388 }
7389
7390 int try_release_extent_buffer(struct page *page)
7391 {
7392 struct extent_buffer *eb;
7393
7394 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
7395 return try_release_subpage_extent_buffer(page);
7396
7397 /*
7398  * We need to make sure nobody is attaching this page to an eb right
7399  * now.
7400  */
7401 spin_lock(&page->mapping->private_lock);
7402 if (!PagePrivate(page)) {
7403 spin_unlock(&page->mapping->private_lock);
7404 return 1;
7405 }
7406
7407 eb = (struct extent_buffer *)page->private;
7408 BUG_ON(!eb);
7409
7410
7411
7412
7413
7414
7415 spin_lock(&eb->refs_lock);
7416 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7417 spin_unlock(&eb->refs_lock);
7418 spin_unlock(&page->mapping->private_lock);
7419 return 0;
7420 }
7421 spin_unlock(&page->mapping->private_lock);
7422
7423 /*
7424  * If tree ref isn't set then we know the ref on this eb is a real ref,
7425  * so just return, this eb will likely be freed soon anyway.
7426  */
7427 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7428 spin_unlock(&eb->refs_lock);
7429 return 0;
7430 }
7431
7432 return release_extent_buffer(eb);
7433 }
7434
7435 /*
7436  * btrfs_readahead_tree_block - attempt to readahead a child block
7437  * @fs_info:    the fs_info
7438  * @bytenr:     bytenr to read
7439  * @owner_root: objectid of the root that owns this eb
7440  * @gen:        generation for the uptodate check, can be 0
7441  * @level:      level for the eb
7442  *
7443  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
7444  * normal uptodate check of the eb, without checking the generation.  If we
7445  * have to read the block we will not block on anything.
7446  */
7447 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
7448 u64 bytenr, u64 owner_root, u64 gen, int level)
7449 {
7450 struct extent_buffer *eb;
7451 int ret;
7452
7453 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
7454 if (IS_ERR(eb))
7455 return;
7456
7457 if (btrfs_buffer_uptodate(eb, gen, 1)) {
7458 free_extent_buffer(eb);
7459 return;
7460 }
7461
7462 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7463 if (ret < 0)
7464 free_extent_buffer_stale(eb);
7465 else
7466 free_extent_buffer(eb);
7467 }
7468
7469 /*
7470  * btrfs_readahead_node_child - readahead a node's child block
7471  * @node:	parent node we're reading from
7472  * @slot:	slot in the parent node for the child we want to read
7473  *
7474  * A helper for btrfs_readahead_tree_block, we just read the bytenr pointed
7475  * at the slot in the node provided.
7476  */
7477 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7478 {
7479 btrfs_readahead_tree_block(node->fs_info,
7480 btrfs_node_blockptr(node, slot),
7481 btrfs_header_owner(node),
7482 btrfs_node_ptr_generation(node, slot),
7483 btrfs_header_level(node) - 1);
7484 }